PDL-Stats-0.855/0000755000175000017500000000000014762214364013205 5ustar osboxesosboxesPDL-Stats-0.855/META.json0000644000175000017500000000325114762214364014627 0ustar osboxesosboxes{ "abstract" : "a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people.", "author" : [ "Maggie J. Xiong " ], "dynamic_config" : 1, "generated_by" : "ExtUtils::MakeMaker version 7.7106, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "PDL-Stats", "no_index" : { "directory" : [ "t", "inc" ] }, "prereqs" : { "build" : { "requires" : { "PDL" : "2.099" } }, "configure" : { "requires" : { "PDL" : "2.099" } }, "develop" : { "requires" : { "CPAN::Changes" : "0" } }, "runtime" : { "recommends" : { "PDL::GSL" : "0", "PDL::Graphics::Simple" : "0" }, "requires" : { "PDL" : "2.099", "perl" : "5.016" } }, "test" : { "requires" : { "Test::More" : "0.88", "Test::PDL" : "0.21" } } }, "release_status" : "stable", "resources" : { "bugtracker" : { "web" : "https://github.com/PDLPorters/PDL-Stats/issues" }, "repository" : { "type" : "git", "url" : "git://github.com/PDLPorters/PDL-Stats.git", "web" : "https://github.com/PDLPorters/PDL-Stats" }, "x_IRC" : "irc://irc.perl.org/#pdl" }, "version" : "0.855", "x_serialization_backend" : "JSON::PP version 4.04" } PDL-Stats-0.855/t/0000755000175000017500000000000014762214364013450 5ustar osboxesosboxesPDL-Stats-0.855/t/glm.t0000644000175000017500000011266414762212775014432 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::Stats::Basic; use PDL::Stats::GLM; use PDL::LiteF; use Test::PDL; is_pdl pdl('BAD 1 2 3 4')->fill_m, pdl('2.5 1 2 3 4'), "fill_m replaces bad values with sample mean"; { my $stdv = pdl('BAD 1 2 3 4')->fill_rand->stdv; ok PDL::Core::approx( $stdv, 1.01980390271856 ) || PDL::Core::approx( $stdv, 1.16619037896906 ), "fill_rand replaces bad values with random sample 
of good values from same variable"; } my $a = sequence 5; is_pdl $a->dev_m, pdl('-2 -1 0 1 2'), "dev_m replaces values with deviations from the mean on $a"; is_pdl $a->stddz, pdl('-1.41421356237309 -0.707106781186547 0 0.707106781186547 1.41421356237309'), "stddz standardizes data on $a"; my $b = pdl(0, 0, 0, 1, 1); is_pdl $a->sse($b), pdl(18), "sse gives sum of squared errors between actual and predicted values between $a and $b"; is_pdl $a->mse($b), pdl(3.6), "mse gives mean of squared errors between actual and predicted values between $a and $b"; is_pdl $a->rmse($b), pdl(1.89736659610103), "rmse gives root mean squared error, ie. stdv around predicted value between $a and $b"; is_pdl $b->glue(1,ones(5))->pred_logistic(pdl(1,2)), pdl('0.880797077977882 0.880797077977882 0.880797077977882 0.952574126822433 0.952574126822433'), "pred_logistic calculates predicted probability value for logistic regression"; my $y = pdl(0, 1, 0, 1, 0); is_pdl $y->d0(), pdl( 6.73011667009256 ), 'd0'; is_pdl $y->dm( ones(5) * .5 ), pdl( 6.93147180559945 ), 'dm'; is_pdl $y->dvrs(ones(5) * .5) ** 2, pdl('1.38629436111989 1.38629436111989 1.38629436111989 1.38629436111989 1.38629436111989'), 'dvrs'; { my $a = pdl(ushort, [0,0,1,0,1], [0,0,0,1,1] ); my $b = cat sequence(5), sequence(5)**2; $b = cat $b, $b * 2; my %m = $a->ols_t($b->dummy(2)); my $rsq = pdl('0.33333333 0.80952381; 0.33333333 0.80952381'); my $coeff = pdl(' [0 0.2 0; -0.057142 0.014285 0.071428] [0 0.1 0; -0.057142 0.007142 0.035714] '); is_pdl $m{R2}, $rsq, 'ols_t R2'; is_pdl $m{b}, $coeff, 'ols_t b'; my %m0 = $a->ols_t(sequence(5), {CONST=>0}); is_pdl $m0{b}, pdl('0.2; 0.23333333'), 'ols_t, const=>0'; } { my $a = sequence 5; my $b = pdl(0,0,0,1,1); my %m = $a->ols($b, {plot=>0}); my %a = ( F => 9, F_df => pdl(1,3), R2 => .75, b => pdl(1, 2.5), b_se => pdl(0.52704628, 0.83333333), b_t => pdl(1.8973666, 3), ss_total => 10, ss_model => 7.5, ); test_stats_cmp(\%m, \%a); } { my $a = pdl '0 1 2 3 4 BAD'; my $b = 
pdl(0,0,0,1,1,1); my %m = $a->ols($b, {plot=>0}); is_pdl $b, pdl(0,0,0,1,1,1), "ols with bad value didn't change caller value"; ok $a->check_badflag, "ols with bad value didn't remove caller bad flag"; my %a = ( F => 9, F_df => pdl(1,3), R2 => .75, b => pdl(1, 2.5), b_se => pdl(0.52704628, 0.83333333), b_t => pdl(1.8973666, 3), ss_total => 10, ss_model => 7.5, ); test_stats_cmp(\%m, \%a); } { my $a = sequence 5, 2; my $b = pdl(0,0,0,1,1); my $c = pdl(0,0,2,2,2); my %m = $a->r2_change( $b, cat $b, $c ); my %a = ( F_change => pdl(3, 3), F_df => pdl(1, 2), R2_change => pdl(.15, .15), ); test_stats_cmp(\%m, \%a); } { # This is the example from Lorch and Myers (1990), # a study on how characteristics of sentences affected reading time # Three within-subject IVs: # SP -- serial position of sentence # WORDS -- number of words in sentence # NEW -- number of new arguments in sentence my $lorch_data = <<'EOF'; Snt Sp Wrds New subj DV 1 1 13 1 1 3.429 2 2 16 3 1 6.482 3 3 9 2 1 1.714 4 4 9 2 1 3.679 5 5 10 3 1 4.000 6 6 18 4 1 6.973 7 7 6 1 1 2.634 1 1 13 1 2 2.795 2 2 16 3 2 5.411 3 3 9 2 2 2.339 4 4 9 2 2 3.714 5 5 10 3 2 2.902 6 6 18 4 2 8.018 7 7 6 1 2 1.750 1 1 13 1 3 4.161 2 2 16 3 3 4.491 3 3 9 2 3 3.018 4 4 9 2 3 2.866 5 5 10 3 3 2.991 6 6 18 4 3 6.625 7 7 6 1 3 2.268 1 1 13 1 4 3.071 2 2 16 3 4 5.063 3 3 9 2 4 2.464 4 4 9 2 4 2.732 5 5 10 3 4 2.670 6 6 18 4 4 7.571 7 7 6 1 4 2.884 1 1 13 1 5 3.625 2 2 16 3 5 9.295 3 3 9 2 5 6.045 4 4 9 2 5 4.205 5 5 10 3 5 3.884 6 6 18 4 5 8.795 7 7 6 1 5 3.491 1 1 13 1 6 3.161 2 2 16 3 6 5.643 3 3 9 2 6 2.455 4 4 9 2 6 6.241 5 5 10 3 6 3.223 6 6 18 4 6 13.188 7 7 6 1 6 3.688 1 1 13 1 7 3.232 2 2 16 3 7 8.357 3 3 9 2 7 4.920 4 4 9 2 7 3.723 5 5 10 3 7 3.143 6 6 18 4 7 11.170 7 7 6 1 7 2.054 1 1 13 1 8 7.161 2 2 16 3 8 4.313 3 3 9 2 8 3.366 4 4 9 2 8 6.330 5 5 10 3 8 6.143 6 6 18 4 8 6.071 7 7 6 1 8 1.696 1 1 13 1 9 1.536 2 2 16 3 9 2.946 3 3 9 2 9 1.375 4 4 9 2 9 1.152 5 5 10 3 9 2.759 6 6 18 4 9 7.964 7 7 6 1 9 1.455 1 1 13 1 10 
4.063 2 2 16 3 10 6.652 3 3 9 2 10 2.179 4 4 9 2 10 3.661 5 5 10 3 10 3.330 6 6 18 4 10 7.866 7 7 6 1 10 3.705 EOF open my $fh, '<', \$lorch_data or die "Couldn't open scalar: $!"; my ($data, $idv, $ido) = rtable $fh, {V=>0}; my %r = $data->slice(',(4)')->ols_rptd( $data->t->using(3,0,1,2) ); #print "\n$_\t$r{$_}\n" for sort keys %r; test_stats_cmp(\%r, { ss_total => pdl(405.188241771429), ss_residual => pdl(58.3754646504336), ss_subject => pdl(51.8590337714289), ss => pdl(18.450705, 73.813294, 0.57026483), ss_err => pdl(23.036272, 10.827623, 5.0104731), coeff => pdl(0.33337285, 0.45858933, 0.15162986), F => pdl(7.208473, 61.354153, 1.0243311), }); } { # pca my $a = pdl ( [qw(1 3 6 6 8)], [qw(1 4 6 8 9)], [qw(0 2 2 4 9)], ); my %p = $a->pca({CORR=>1, PLOT=>0}); my %a = ( eigenvalue => float( qw( 2.786684 0.18473727 0.028578689) ), # loadings in R eigenvector => [float( # v1 v2 v3 [qw( 0.58518141 0.58668657 0.55978709)], # comp1 [qw( -0.41537629 -0.37601061 0.82829859)], # comp2 [qw( -0.69643754 0.71722722 -0.023661276)], # comp3 ), \&PDL::abs], loadings => [float( [qw( 0.97686463 0.97937725 0.93447296)], [qw( -0.17853319 -0.1616134 0.35601163)], [qw( -0.11773439 0.12124893 -0.0039999937)], ), \&PDL::abs], pct_var => pdl( qw(0.92889468 0.06157909 0.0095262297) ), ); test_stats_cmp(\%p, \%a, 1e-5); %p = $a->pca({CORR=>0, PLOT=>0}); %a = ( eigenvalue => [float(qw[ 22.0561695 1.581758022 0.202065959 ]), \&PDL::abs], eigenvector => [float( [qw(-0.511688 -0.595281 -0.619528)], [qw( 0.413568 0.461388 -0.78491)], [qw( 0.753085 -0.657846 0.0101023)], ), \&PDL::abs], loadings => [pdl( [qw(-0.96823408 -0.9739215 -0.94697802)], [qw( 0.20956865 0.20214966 -0.32129495)], [qw( 0.13639532 -0.10301693 0.001478041)], ), \&PDL::abs], pct_var => pdl( qw[0.925175 0.0663489 0.00847592] ), ); test_stats_cmp(\%p, \%a, 1e-4); } { # pca_sorti - principal component analysis output sorted to find which vars a component is best represented my $a = pdl ' 0 1 2 3 4 5 6 0 8 9; 10 11 12 13 0 15 16 
17 18 19; 20 0 22 23 24 25 26 27 0 29; 30 31 32 33 34 0 36 37 38 39; 40 41 0 43 44 45 46 47 48 0 '; my %m = $a->pca({PLOT=>0}); my %exp = ( eigenvalue => [float('1.59695565700531 1.17390930652618 1.05055177211761 0.60359400510788 0.574989080429077'), \&PDL::abs], eigenvector => [float(' 0.576511 0.538729 0.213031 0.232488 0.527233; 0.237671 0.290144 0.792538 0.305884 0.371008; 0.279697 0.367018 0.0690239 0.811981 0.350699; 0.707315 0.697799 0.0780877 0.0814506 0.00705761; 0.180619 0.062942 0.561818 0.431784 0.679219 '), \&PDL::abs], loadings => [my $loadings = pdl(' 0.72854035 0.68079491 0.2692094 0.29379663 -0.66626785; -0.25750895 -0.31436277 0.85869179 -0.33141797 -0.4019762; -0.28667967 -0.37618117 -0.07074587 0.83225134 -0.35945416; -0.54952221 0.54212932 0.060667887 0.063280383 0.0054838393; 0.13696062 0.047727115 0.42601542 0.32741254 0.51503877 '), \&PDL::abs], pct_var => pdl('0.319391131401062 0.234781861305237 0.210110354423523 0.120718801021576 0.114997816085815'), ); test_stats_cmp(\%m, \%exp, 1e-4); is_pdl $m{loadings}, $loadings, {require_equal_types=>0}; my ($iv, $ic) = $loadings->pca_sorti; is_pdl $iv, indx(qw(0 1 4 2 3)); is_pdl $ic, pdl(qw( 0 1 2 )); } SKIP: { eval { require PDL::Fit::LM; }; skip 'no PDL::Fit::LM', 1 if $@; my $y = pdl( 0, 0, 0, 1, 1 ); my $x = pdl(2, 3, 5, 5, 5); my %m = $y->logistic( $x ); my $y_pred = $x->glue(1, ones(5))->pred_logistic( $m{b} ); my $y_pred_ans = pdl qw(7.2364053e-07 0.00010154254 0.66666667 0.66666667 0.66666667); is_pdl $y_pred, $y_pred_ans; is_pdl $m{Dm_chisq}, pdl 2.91082711764867; %m = $y->logistic( $x, {COV=>1} ); isnt $m{cov}, undef, 'get cov from logistic if ask'; }; my $a_bad = pdl '0 1 2 3 4 BAD'; my $b_bad = pdl 'BAD 0 0 0 1 1'; is_pdl $a_bad->dev_m, pdl( '-2 -1 0 1 2 BAD' ), "dev_m with bad values $a_bad"; is_pdl $a_bad->stddz, pdl( '-1.41421356237309 -0.707106781186547 0 0.707106781186547 1.41421356237309 BAD' ), "stdz with bad values $a_bad"; is_pdl $a_bad->sse($b_bad), pdl(23), "sse with bad 
values between $a_bad and $b_bad"; is_pdl $a_bad->mse($b_bad), pdl(5.75), "mse with badvalues between $a_bad and $b_bad"; is_pdl $a_bad->rmse($b_bad), pdl( 2.39791576165636 ), "rmse with bad values between $a_bad and $b_bad"; is_pdl $b_bad->glue(1,ones(6))->pred_logistic(pdl(1,2)), pdl( 'BAD 0.880797077977882 0.880797077977882 0.880797077977882 0.952574126822433 0.952574126822433' ), "pred_logistic with bad values"; is_pdl $b_bad->d0(), pdl( 6.73011667009256 ), "null deviance with bad values on $b_bad"; is_pdl $b_bad->dm( ones(6) * .5 ), pdl( 6.93147180559945 ), "model deviance with bad values on $b_bad"; is_pdl $b_bad->dvrs(ones(6) * .5), pdl( 'BAD -1.17741002251547 -1.17741002251547 -1.17741002251547 1.17741002251547 1.17741002251547' ), "deviance residual with bad values on $b_bad"; { eval { effect_code(['a']) }; isnt $@, '', 'effect_code with only one value dies'; my $a = scalar effect_code([qw(a a a b b b b c c BAD)]); is_pdl $a, pdl('1 1 1 0 0 0 0 -1 -1 BAD; 0 0 0 1 1 1 1 -1 -1 BAD'), 'effect_code coded with bad value'; } { eval { effect_code_w(['a']) }; isnt $@, '', 'effect_code_w with only one value dies'; is_pdl scalar effect_code_w([qw(a a a b b b b c c c)]), pdl ' 1 1 1 0 0 0 0 -1 -1 -1; 0 0 0 1 1 1 1 -1.3333333 -1.3333333 -1.3333333 '; } { # anova 3 way my $d = sequence 60; my @a = (('a')x 15, ('b')x 15, ('c')x 15, ('d')x 15); my $b = $d % 3; my $c = $d % 2; $d->set( 20, 10 ); my @idv = qw(A B C); my %m = $d->anova(\@a, $b, $c, {IVNM=>\@idv, plot=>0}); $m{'| A ~ B ~ C | m'} = $m{'| A ~ B ~ C | m'}->slice(',(2),'); test_stats_cmp(\%m, { '| A | F' => 165.252100840336, '| A ~ B ~ C | F' => 0.0756302521008415, '| A ~ B ~ C | m' => pdl([[qw(8 18 38 53)], [qw(8 23 38 53)]]), }); my $dsgn = $d->anova_design_matrix(undef, \@a, $b, $c, {IVNM=>\@idv}); is_pdl $dsgn, pdl ' [1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0] [1 1 0 0 0 1 -1 0 0 0 1 0 0 -1 0 0 0 -1 0 0 0 -1 0 0] [1 1 0 0 -1 -1 1 -1 0 0 -1 0 0 1 0 0 -1 -1 -1 0 0 -1 0 0] [1 1 0 0 1 0 -1 1 0 0 0 0 0 -1 0 
0 -1 0 -1 0 0 0 0 0] [1 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0] [1 1 0 0 -1 -1 -1 -1 0 0 -1 0 0 -1 0 0 1 1 1 0 0 1 0 0] [1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0] [1 1 0 0 0 1 -1 0 0 0 1 0 0 -1 0 0 0 -1 0 0 0 -1 0 0] [1 1 0 0 -1 -1 1 -1 0 0 -1 0 0 1 0 0 -1 -1 -1 0 0 -1 0 0] [1 1 0 0 1 0 -1 1 0 0 0 0 0 -1 0 0 -1 0 -1 0 0 0 0 0] [1 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 0] [1 1 0 0 -1 -1 -1 -1 0 0 -1 0 0 -1 0 0 1 1 1 0 0 1 0 0] [1 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0] [1 1 0 0 0 1 -1 0 0 0 1 0 0 -1 0 0 0 -1 0 0 0 -1 0 0] [1 1 0 0 -1 -1 1 -1 0 0 -1 0 0 1 0 0 -1 -1 -1 0 0 -1 0 0] [1 0 1 0 1 0 -1 0 1 0 0 0 0 0 -1 0 -1 0 0 -1 0 0 0 0] [1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0] [1 0 1 0 -1 -1 -1 0 -1 0 0 -1 0 0 -1 0 1 1 0 1 0 0 1 0] [1 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0] [1 0 1 0 0 1 -1 0 0 0 0 1 0 0 -1 0 0 -1 0 0 0 0 -1 0] [1 0 1 0 -1 -1 1 0 -1 0 0 -1 0 0 1 0 -1 -1 0 -1 0 0 -1 0] [1 0 1 0 1 0 -1 0 1 0 0 0 0 0 -1 0 -1 0 0 -1 0 0 0 0] [1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0] [1 0 1 0 -1 -1 -1 0 -1 0 0 -1 0 0 -1 0 1 1 0 1 0 0 1 0] [1 0 1 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0] [1 0 1 0 0 1 -1 0 0 0 0 1 0 0 -1 0 0 -1 0 0 0 0 -1 0] [1 0 1 0 -1 -1 1 0 -1 0 0 -1 0 0 1 0 -1 -1 0 -1 0 0 -1 0] [1 0 1 0 1 0 -1 0 1 0 0 0 0 0 -1 0 -1 0 0 -1 0 0 0 0] [1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0] [1 0 1 0 -1 -1 -1 0 -1 0 0 -1 0 0 -1 0 1 1 0 1 0 0 1 0] [1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0] [1 0 0 1 0 1 -1 0 0 0 0 0 1 0 0 -1 0 -1 0 0 0 0 0 -1] [1 0 0 1 -1 -1 1 0 0 -1 0 0 -1 0 0 1 -1 -1 0 0 -1 0 0 -1] [1 0 0 1 1 0 -1 0 0 1 0 0 0 0 0 -1 -1 0 0 0 -1 0 0 0] [1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1] [1 0 0 1 -1 -1 -1 0 0 -1 0 0 -1 0 0 -1 1 1 0 0 1 0 0 1] [1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0] [1 0 0 1 0 1 -1 0 0 0 0 0 1 0 0 -1 0 -1 0 0 0 0 0 -1] [1 0 0 1 -1 -1 1 0 0 -1 0 0 -1 0 0 1 -1 -1 0 0 -1 0 0 -1] [1 0 0 1 1 0 -1 0 0 1 0 0 0 0 0 -1 -1 0 0 0 -1 0 0 0] [1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 
1] [1 0 0 1 -1 -1 -1 0 0 -1 0 0 -1 0 0 -1 1 1 0 0 1 0 0 1] [1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0] [1 0 0 1 0 1 -1 0 0 0 0 0 1 0 0 -1 0 -1 0 0 0 0 0 -1] [1 0 0 1 -1 -1 1 0 0 -1 0 0 -1 0 0 1 -1 -1 0 0 -1 0 0 -1] [1 -1 -1 -1 1 0 -1 -1 -1 -1 0 0 0 1 1 1 -1 0 1 1 1 0 0 0] [1 -1 -1 -1 0 1 1 0 0 0 -1 -1 -1 -1 -1 -1 0 1 0 0 0 -1 -1 -1] [1 -1 -1 -1 -1 -1 -1 1 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1] [1 -1 -1 -1 1 0 1 -1 -1 -1 0 0 0 -1 -1 -1 1 0 -1 -1 -1 0 0 0] [1 -1 -1 -1 0 1 -1 0 0 0 -1 -1 -1 1 1 1 0 -1 0 0 0 1 1 1] [1 -1 -1 -1 -1 -1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 1 1 1 1 1 1] [1 -1 -1 -1 1 0 -1 -1 -1 -1 0 0 0 1 1 1 -1 0 1 1 1 0 0 0] [1 -1 -1 -1 0 1 1 0 0 0 -1 -1 -1 -1 -1 -1 0 1 0 0 0 -1 -1 -1] [1 -1 -1 -1 -1 -1 -1 1 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1] [1 -1 -1 -1 1 0 1 -1 -1 -1 0 0 0 -1 -1 -1 1 0 -1 -1 -1 0 0 0] [1 -1 -1 -1 0 1 -1 0 0 0 -1 -1 -1 1 1 1 0 -1 0 0 0 1 1 1] [1 -1 -1 -1 -1 -1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 1 1 1 1 1 1] [1 -1 -1 -1 1 0 -1 -1 -1 -1 0 0 0 1 1 1 -1 0 1 1 1 0 0 0] [1 -1 -1 -1 0 1 1 0 0 0 -1 -1 -1 -1 -1 -1 0 1 0 0 0 -1 -1 -1] [1 -1 -1 -1 -1 -1 -1 1 1 1 1 1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1] '; } { # anova with too few samples for experiment (3*2*2 categories, 12 samples) my $y = pdl '[1 1 2 2 3 3 3 3 4 5 5 5]'; # ratings for 12 apples my $a = sequence(12) % 3 + 1; # IV for types of apple my @b = qw( y y y y y y n n n n n n ); # IV for whether we baked the apple my @c = qw( r g r g r g r g r g r g ); # IV for apple colour (red/green) eval {$y->anova( $a, \@b, \@c, { IVNM=>[qw(apple bake colour)], PLOT=>0 } )}; like $@, qr/residual df = 0/, 'error when too few sample'; } { # anova 1 way my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 ); my $a = qsort sequence(15) % 3; my %m = $d->anova($a, {plot=>0}); test_stats_cmp(\%m, { F => 0.160919540229886, ms_model => 0.466666666666669, '| IV_0 | m' => pdl(qw( 2.6 2.8 3.2 )), }); } { # anova_3w bad dv my $d = sequence 60; $d->set( 20, 10 ); $d->setbadat(1); $d->setbadat(10); my @a = (('a')x 15, ('b')x 15, 
('c')x 15, ('d')x 15); my $b = sequence(60) % 3; my $c = sequence(60) % 2; my %m = $d->anova(\@a, $b, $c, {IVNM=>[qw(A B C)], plot=>0, v=>0}); $m{$_} = $m{$_}->slice(',(1)') for '| A ~ B ~ C | m', '| A ~ B ~ C | se'; test_stats_cmp(\%m, { '| A | F' => 150.00306433446, '| A ~ B ~ C | F' => 0.17534855325553, '| A ~ B ~ C | m' => pdl([qw( 4 22 37 52 )], [qw( 10 22 37 52 )]), '| A ~ B ~ C | se' => pdl([qw( 0 6 1.7320508 3.4641016 )], [qw( 3 3 3.4641016 1.7320508 )]), }); } { # anova_3w bad dv iv my $d = sequence 63; my @a = (('a')x 15, ('b')x 15, ('c')x 15, ('d')x 15); push @a, undef, qw( b c ); my $b = $d % 3; my $c = $d % 2; $d->set( 20, 10 ); $d->setbadat(62); $b->setbadat(61); my %m = $d->anova(\@a, $b, $c, {IVNM=>[qw(A B C)], plot=>0, V=>0}); $m{$_} = $m{$_}->slice(',(2)') for '| A ~ B ~ C | m'; test_stats_cmp(\%m, { '| A | F' => 165.252100840336, '| A ~ B ~ C | F' => 0.0756302521008415, '| A ~ B ~ C | m' => pdl([qw(8 18 38 53)], [qw(8 23 38 53)]), }); } { # anova_nist_low # data from https://www.itl.nist.gov/div898/strd/anova/SmLs01.html # 1 2 3 4 5 6 7 8 9 # 1.4 1.3 1.5 1.3 1.5 1.3 1.5 1.3 1.5 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # 1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4 # 1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6 # Certified Values: # Source of Sums of Mean # Variation df Squares Squares F Statistic # Between Treatment 8 
1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01 # Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02 # Certified R-Squared 4.82758620689655E-01 # Certified Residual # Standard Deviation 1.00000000000000E-01 my $data = pdl('[ [1.4 1.3 1.5 1.3 1.5 1.3 1.5 1.3 1.5] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] [1.3 1.2 1.4 1.2 1.4 1.2 1.4 1.2 1.4] [1.5 1.4 1.6 1.4 1.6 1.4 1.6 1.4 1.6] ]')->flat; my %m = $data->anova(my $iv = sequence(9)->dummy(1,21)->flat); test_stats_cmp(\%m, { '| IV_0 | ms' => 0.21, '| IV_0 | ss' => 1.68, F => 21, F_df => pdl(8, 180), ss_model => 1.68, ss_residual => 1.8, ms_residual => 0.01, }); } is_pdl pdl('BAD 1 2 3 4; BAD BAD BAD BAD BAD')->fill_m, pdl('2.5 1 2 3 4; 0 0 0 0 0'), 'fill_m nan to bad'; is_pdl pdl([1,1,1], [2,2,2])->stddz, zeroes(3,2), 'stddz nan vs bad'; { # data from https://www.youtube.com/watch?v=Fh73dAOMm9M # Person,Before,After 2 weeks,After 4 weeks # P1,102,97,95 # P2,79,77,75 # P3,83,77,75 # P4,92,93,87 # library(data.table) # library(rstatix) # tdata <- data.frame( # stringsAsFactors = FALSE, # dv = c(102.0,97.0,95.0,79.0,77.0,75.0,83.0,77.0,75.0,92.0,93.0,87.0), # id = c(0L,0L,0L,1L,1L,1L,2L,2L,2L,3L,3L,3L), # wk = c(0L,2L,4L,0L,2L,4L,0L,2L,4L,0L,2L,4L) # ) # as.data.table(tdata) # res.aov <- anova_test( # data = tdata, dv = dv, wid = id, # within = c(wk), detailed = 
TRUE # ) # res.aov # get_anova_table(res.aov, correction = "none") # ANOVA Table (type III tests) # Effect DFn DFd SSn SSd F p p<.05 ges # 1 (Intercept) 1 3 88752 916.667 290.461 0.00044 * 0.990 # 2 wk 2 6 72 17.333 12.462 0.00700 * 0.072 # turned into format for anova_rptd, then ($data, $idv, $subj) = rtable 'diet.txt', {SEP=>','} # Person,Week,Weight # P1,0,102 # P1,2,97 # P1,4,95 # P2,0,79 # P2,2,77 # P2,4,75 # P3,0,83 # P3,2,77 # P3,4,75 # P4,0,92 # P4,2,93 # P4,4,87 my ($data, $ivnm, $subj) = ( pdl( q[ [ 0 2 4 0 2 4 0 2 4 0 2 4] [102 97 95 79 77 75 83 77 75 92 93 87] ] ), [ qw(Week) ], [ qw(P1 P1 P1 P2 P2 P2 P3 P3 P3 P4 P4 P4) ], ); my ($w, $dv) = $data->dog; my %m = $dv->anova_rptd($subj, $w, {ivnm=>$ivnm}); test_stats_cmp(\%m, { '| Week | F' => 12.4615384615385, '| Week | df' => 2, '| Week | ms' => 36, '| Week | ss' => 72, ss_subject => 916.666666, }); } { # anova_rptd_1w my $d = pdl qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 ); my $s = sequence(5)->dummy(1,3)->flat; my $a = qsort sequence(15) % 3; my $dsgn = $d->anova_design_matrix($s, $a, {plot=>0}); is_pdl $dsgn, pdl ' [1 1 0 1 0 0 0] [1 1 0 0 1 0 0] [1 1 0 0 0 1 0] [1 1 0 0 0 0 1] [1 1 0 -1 -1 -1 -1] [1 0 1 1 0 0 0] [1 0 1 0 1 0 0] [1 0 1 0 0 1 0] [1 0 1 0 0 0 1] [1 0 1 -1 -1 -1 -1] [1 -1 -1 1 0 0 0] [1 -1 -1 0 1 0 0] [1 -1 -1 0 0 1 0] [1 -1 -1 0 0 0 1] [1 -1 -1 -1 -1 -1 -1] '; my %m = $d->anova_rptd($s, $a, {plot=>0}); test_stats_cmp(\%m, { '| IV_0 | F' => 0.145077720207254, '| IV_0 | ms' => 0.466666666666667, '| IV_0 | m' => pdl(qw( 2.6 2.8 3.2 )), }); } my %anova_bad_a = ( '| a | F' => 0.351351351351351, '| a | ms' => 0.722222222222222, '| a ~ b | F' => 5.25, '| a ~ b | m' => pdl(qw( 3 1.3333333 3.3333333 3.3333333 3.6666667 2.6666667 ))->reshape(3,2), ); { # anova_rptd_2w bad dv my $d = pdl '[3 2 1 5 2 BAD 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2]'; my $s = pdl '[0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3]'; my $a = pdl '[0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2]'; my $b = pdl '[0 0 0 0 1 1 1 1 0 0 0 0 
1 1 1 1 0 0 0 0 1 1 1 1]'; my $dsgn = $d->anova_design_matrix($s, $a, $b, {v=>0}); is_pdl $dsgn, pdl ' [ 1 1 0 1 1 0 1 0 0 0 1 0 1 0] [ 1 1 0 1 1 0 0 0 1 0 0 1 0 1] [ 1 1 0 1 1 0 -1 0 -1 0 -1 -1 -1 -1] [ 1 1 0 -1 -1 0 1 0 0 0 -1 0 1 0] [ 1 1 0 -1 -1 0 0 0 1 0 0 -1 0 1] [ 1 1 0 -1 -1 0 -1 0 -1 0 1 1 -1 -1] [ 1 0 1 1 0 1 0 1 0 0 1 0 1 0] [ 1 0 1 1 0 1 0 0 0 1 0 1 0 1] [ 1 0 1 1 0 1 0 -1 0 -1 -1 -1 -1 -1] [ 1 0 1 -1 0 -1 0 1 0 0 -1 0 1 0] [ 1 0 1 -1 0 -1 0 0 0 1 0 -1 0 1] [ 1 0 1 -1 0 -1 0 -1 0 -1 1 1 -1 -1] [ 1 -1 -1 1 -1 -1 -1 -1 0 0 1 0 1 0] [ 1 -1 -1 1 -1 -1 0 0 -1 -1 0 1 0 1] [ 1 -1 -1 1 -1 -1 1 1 1 1 -1 -1 -1 -1] [ 1 -1 -1 -1 1 1 -1 -1 0 0 -1 0 1 0] [ 1 -1 -1 -1 1 1 0 0 -1 -1 0 -1 0 1] [ 1 -1 -1 -1 1 1 1 1 1 1 1 1 -1 -1] '; my %m = $d->anova_rptd($s, $a, $b, {ivnm=>['a','b'],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_bad_a); } { # anova_rptd_2w bad iv my $d = pdl '[3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2]'; my $s = pdl '[0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3]'; my $a = pdl '[0 0 0 0 0 BAD 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2]'; my $b = pdl '[0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1]'; my %m = $d->anova_rptd($s, $a, $b, {ivnm=>['a','b'],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_bad_a); } { # anova_rptd_3w my $d = pdl( qw( 3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2 ), qw( 5 5 1 1 4 4 1 4 4 2 3 3 5 1 1 2 4 4 4 5 5 1 1 2 ) ); my $s = sequence(4)->dummy(0,12)->flat; my $a = sequence(2)->dummy(0,6)->flat->dummy(1,4)->flat; my $b = sequence(2)->dummy(0,3)->flat->dummy(1,8)->flat; my $c = sequence(3)->dummy(1,16)->flat; my %m = $d->anova_rptd($s, $a, $b, $c, {ivnm=>['a','b', 'c'],plot=>0}); test_stats_cmp(\%m, { '| a | F' => 0.572519083969459, '| a | ms' => 0.520833333333327, '| a ~ c | F' => 3.64615384615385, '| b ~ c || err ms' => 2.63194444444445, '| a ~ b ~ c | F' => 1.71299093655589, '| a ~ b ~ c | m' => pdl(qw( 4 2.75 2.75 2.5 3.25 4.25 3.5 1.75 2 3.5 2.75 2.25 ))->reshape(2,2,3), '| a ~ b | se' => ones(2, 2) * 0.55014729, }); } 
sub test_stats_cmp { local $Test::Builder::Level = $Test::Builder::Level + 1; my ($m, $ans, $eps) = @_; $eps ||= 1e-6; foreach (sort keys %$ans) { die "No '$_' value received" if !exists $m->{$_}; my $got = PDL->topdl($m->{$_}); my $exp = $ans->{$_}; if (ref $exp eq 'ARRAY') { ($exp, my $func) = @$exp; ($got, $exp) = map &$func($_), $got, $exp; } is_pdl $got, PDL->topdl($exp), {atol=>$eps, require_equal_types=>0, test_name=>$_}; } } # Tests for mixed anova thanks to Erich Greene my %anova_ans_l2_common = ( '| within | df' => 2, '| within || err df' => 12, '| within | ss' => .25, '| within | ms' => .125, '| within || err ss' => 23.666667, '| within || err ms' => 1.9722222, '| within | F' => 0.063380282, '| between | df' => 1, '| between || err df' => 6, '| between | ss' => 2.0416667, '| between | ms' => 2.0416667, '| between || err ss' => 16.583333, '| between || err ms' => 2.7638889, '| between | F' => 0.73869347, '| within ~ between | df' => 2, '| within ~ between | ss' => 6.0833333, '| within ~ between | ms' => 3.0416667, '| within ~ between | F' => 1.5422535, ); $anova_ans_l2_common{"| within ~ between || err $_"} = $anova_ans_l2_common{"| within || err $_"} foreach qw/df ss ms/; { # anova_rptd mixed with 2 btwn-subj var levels, data grouped by within var my $d = pdl '[3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2]'; my $s = pdl '[0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7]'; my $w = pdl '[0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2]'; my $b = pdl '[0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1]'; my %m = $d->anova_rptd($s,$w,$b,{ivnm=>['within','between'],btwn=>[1],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_ans_l2_common); } { # anova_rptd mixed with 2 btwn-subj var levels, data grouped by subject my $d = pdl '[3 1 4 2 4 2 1 1 1 5 2 5 2 3 4 1 5 3 5 5 2 3 3 2]'; my $s = pdl '[0 0 0 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7]'; my $w = pdl '[0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2]'; my $b = pdl '[0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 
1 1]'; my @idv = qw(within between); my %m = $d->anova_rptd($s,$w,$b,{ivnm=>\@idv,btwn=>[1],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_ans_l2_common); my $dsgn = $d->anova_design_matrix($s,$w,$b,{ivnm=>\@idv,btwn=>[1],v=>0}); is_pdl $dsgn, pdl ' [1 1 0 1 1 0 1 0 0 0 0 0] [1 0 1 1 0 1 1 0 0 0 0 0] [1 -1 -1 1 -1 -1 1 0 0 0 0 0] [1 1 0 1 1 0 0 1 0 0 0 0] [1 0 1 1 0 1 0 1 0 0 0 0] [1 -1 -1 1 -1 -1 0 1 0 0 0 0] [1 1 0 1 1 0 0 0 1 0 0 0] [1 0 1 1 0 1 0 0 1 0 0 0] [1 -1 -1 1 -1 -1 0 0 1 0 0 0] [1 1 0 1 1 0 -1 -1 -1 0 0 0] [1 0 1 1 0 1 -1 -1 -1 0 0 0] [1 -1 -1 1 -1 -1 -1 -1 -1 0 0 0] [1 1 0 -1 -1 0 0 0 0 1 0 0] [1 0 1 -1 0 -1 0 0 0 1 0 0] [1 -1 -1 -1 1 1 0 0 0 1 0 0] [1 1 0 -1 -1 0 0 0 0 0 1 0] [1 0 1 -1 0 -1 0 0 0 0 1 0] [1 -1 -1 -1 1 1 0 0 0 0 1 0] [1 1 0 -1 -1 0 0 0 0 0 0 1] [1 0 1 -1 0 -1 0 0 0 0 0 1] [1 -1 -1 -1 1 1 0 0 0 0 0 1] [1 1 0 -1 -1 0 0 0 0 -1 -1 -1] [1 0 1 -1 0 -1 0 0 0 -1 -1 -1] [1 -1 -1 -1 1 1 0 0 0 -1 -1 -1] '; } my %anova_ans_l3_common = ( '| within | df' => 2, '| within || err df' => 12, '| within | ss' => .962962, '| within | ms' => .481481, '| within || err ss' => 20.888888, '| within || err ms' => 1.740740, '| within | F' => .276596, '| between | df' => 2, '| between || err df' => 6, '| between | ss' => 1.185185, '| between | ms' => .592592, '| between || err ss' => 13.111111, '| between || err ms' => 2.185185, '| between | F' => .271186, '| between ~ within | df' => 4, '| between ~ within | ss' => 4.148148, '| between ~ within | ms' => 1.037037, '| between ~ within | F' => .595744, ); $anova_ans_l3_common{"| between ~ within || err $_"} = $anova_ans_l3_common{"| within || err $_"} foreach qw/df ss ms/; { # anova_rptd mixed with 3 btwn-subj var levels, data grouped by within var my $d = pdl '[5 2 2 5 4 1 5 3 5 4 4 3 4 3 4 3 5 1 4 3 3 4 5 4 5 5 2]'; my $s = pdl '[0 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 8]'; my $w = pdl '[0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2]'; my $b = pdl '[0 0 0 1 1 1 2 2 2 0 0 0 1 1 1 2 2 2 0 0 0 1 1 1 
2 2 2]'; my @idv = qw(between within); my %m = $d->anova_rptd($s,$b,$w,{ivnm=>\@idv,btwn=>[0],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_ans_l3_common); } { # anova_rptd mixed with 3 btwn-subj var levels, data grouped by subject my $d = pdl '[5 4 4 2 4 3 2 3 3 5 4 4 4 3 5 1 4 4 5 3 5 3 5 5 5 1 2]'; my $s = pdl '[0 0 0 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 6 6 6 7 7 7 8 8 8]'; my $w = pdl '[0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2 0 1 2]'; my $b = pdl '[0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2]'; my @idv = qw(between within); my %m = $d->anova_rptd($s,$b,$w,{ivnm=>\@idv,btwn=>[0],plot=>0, v=>0}); test_stats_cmp(\%m, \%anova_ans_l3_common); } { # from Rutherford (2011) p200, mixed anova my $d = pdl '7 7 8 16 16 24 3 11 14 7 10 29 6 9 10 11 13 10 6 11 11 9 10 22 5 10 12 10 10 25 8 10 10 11 14 28 6 11 11 8 11 22 7 11 12 8 12 24'; my $s = pdl '1 1 1 9 9 9 2 2 2 10 10 10 3 3 3 11 11 11 4 4 4 12 12 12 5 5 5 13 13 13 6 6 6 14 14 14 7 7 7 15 15 15 8 8 8 16 16 16'; my $w = pdl '1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3 1 2 3'; my $b = pdl '1 1 1 2 2 2 1 1 1 2 2 2 1 1 1 2 2 2 1 1 1 2 2 2 1 1 1 2 2 2 1 1 1 2 2 2 1 1 1 2 2 2 1 1 1 2 2 2'; my $exp = { '| time | F' => 37.2348284960422, '| time | ms' => 336, '| instructions ~ time | F' => 12.4116094986807, '| instructions | F' => 47.4973821989529, }; my @idv = qw(instructions time); my %m = $d->anova_rptd($s,$b,$w,{ivnm=>\@idv,btwn=>[0],plot=>0, v=>0}); test_stats_cmp(\%m, $exp); my $inds_by_i_t_subj = PDL::glue(0, map $_->t, $b, $w, $s)->qsortveci; $_ = $_->index($inds_by_i_t_subj) for $d, $s, $b, $w; my $dsgn = $d->anova_design_matrix($s,$b,$w,{ivnm=>\@idv,btwn=>[0],v=>0}); is_pdl $dsgn, pdl ' [1 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0] [1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0] [1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0] [1 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] [1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0] [1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0] [1 1 1 0 1 0 0 0 0 0 0 0 1 
0 0 0 0 0 0 0] [1 1 1 0 1 0 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 0 0 0] [1 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0] [1 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0] [1 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0] [1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0] [1 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0] [1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0] [1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0] [1 1 0 1 0 1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 1 0 0 0 0 0 0 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 0 1 0 0 0 0 0 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 0 0 1 0 0 0 0 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 0 0 0 1 0 0 0 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 0 0 0 0 1 0 0 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 0 0 0 0 0 1 0 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 0 0 0 0 0 0 1 0 0 0 0 0 0 0] [1 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 0 0 0] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] [1 -1 1 0 -1 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -1] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 1 0 0 0 0 0 0] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 0 1 0 0 0 0 0] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 0 0 1 0 0 0 0] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 0 0 0 1 0 0 0] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 0 0 0 0 1 0 0] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 1 0] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 0 0 0 0 0 0 1] [1 -1 0 1 0 -1 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -1] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1] [1 -1 -1 -1 1 1 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -1] '; %m = $d->anova_rptd($s,$b,$w,{ivnm=>\@idv,btwn=>[0],plot=>0, v=>0}); test_stats_cmp(\%m, $exp); } my %ans_mixed = ( '| a | F' => 
0.0633802816901399, '| a | ms' => 0.125, '| a ~ b | F' => 1.54225352112676, '| b | F' => 0.738693467336681, '| b || err ms' => 2.76388888888889, '| a ~ b | se' => ones(3,2) * 0.70217915, ); { # anova_rptd mixed my $d = pdl '[3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2]'; my $s = pdl '[0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3]'; my $a = pdl '[0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2]'; my $b = pdl '[0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1]'; my %m = $d->anova_rptd($s, $a, $b, {ivnm=>['a','b'],btwn=>[1],plot=>0, v=>0}); test_stats_cmp(\%m, \%ans_mixed); } { # anova_rptd mixed bad # with the "bad" ie removed subject and data removed, in R: # library(data.table) # library(rstatix) # tdata <- data.frame( # stringsAsFactors = FALSE, # dv = c(3.0,2.0,1.0,5.0,2.0,1.0,5.0,3.0,1.0,4.0,1.0,2.0,3.0,5.0,5.0,3.0,4.0,2.0,1.0,5.0,4.0,3.0,2.0,2), # id = c(0L,1L,2L,3L,0L,1L,2L,3L,0L,1L,2L,3L,0L,1L,2L,3L,0L,1L,2L,3L,0L,1L,2L,3L), # w = c(0L,0L,0L,0L,0L,0L,0L,0L,1L,1L,1L,1L,1L,1L,1L,1L,2L,2L,2L,2L,2L,2L,2L,2L), # b = c(0L,0L,0L,0L,1L,1L,1L,1L,0L,0L,0L,0L,1L,1L,1L,1L,0L,0L,0L,0L,1L,1L,1L,1L) # ) # as.data.table(tdata) # tdata <- tdata %>% convert_as_factor(id, w, b) # as.data.table(tdata) # res.aov <- anova_test( # data = tdata, dv = dv, wid = id, # within = c(w), between = c(b), detailed = TRUE # ) # get_anova_table(res.aov, correction = "none") # Effect DFn DFd SSn SSd F p p<.05 ges # 1 (Intercept) 1 6 198.375 16.583 71.774 0.000148 * 0.831 # 2 b 1 6 2.042 16.583 0.739 0.423000 0.048 # 3 w 2 12 0.250 23.667 0.063 0.939000 0.006 # 4 b:w 2 12 6.083 23.667 1.542 0.253000 0.131 my $d = pdl '[3 2 1 5 2 1 5 3 1 4 1 2 3 5 5 3 4 2 1 5 4 3 2 2 1 1 1 1]'; my $s = pdl '[0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 0 1 2 3 4 4 4 4]'; my $a = pdl '[0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 0 0 0 0]'; my $b = pdl '[0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 BAD]'; # any missing value causes all data from the subject (4) to be dropped my %m = $d->anova_rptd($s, 
$a, $b, {ivnm=>['a','b'],btwn=>[1],plot=>0, v=>0}); test_stats_cmp(\%m, \%ans_mixed); } { # anova_rptd_mixed_4w my ($data, $idv, $subj) = rtable \*DATA, {v=>0}; my ($age, $aa, $beer, $wings, $dv) = $data->dog; my %m = $dv->anova_rptd( $subj, $age, $aa, $beer, $wings, { ivnm=>[qw(age aa beer wings)], btwn=>[0,1], v=>0, plot=>0 } ); test_stats_cmp(\%m, { '| aa | F' => 0.0829493087557666, '| age ~ aa | F' => 2.3594470046083, '| beer | F' => 0.00943396226415362, '| aa ~ beer | F' => 0.235849056603778, '| age ~ beer ~ wings | F' => 0.0303030303030338, '| beer ~ wings | F' => 2.73484848484849, '| age ~ aa ~ beer ~ wings | F' => 3.03030303030303, }); } { my $a = effect_code( sequence(12) > 5 ); my $b = effect_code([ (0,1)x 6 ]); my $c = effect_code([ (0,0,1,1,2,2)x 2 ]); my $ans = pdl '1 -1 0 -0 -1 1 -1 1 -0 0 1 -1; 0 -0 1 -1 -1 1 -0 0 -1 1 1 -1'; my $inter = interaction_code( $a, $b, $c); is_pdl $inter, $ans, 'interaction_code'; } done_testing(); __DATA__ subj age Apple-android beer wings recall 1 0 0 0 0 5 1 0 0 0 1 4 1 0 0 1 0 8 1 0 0 1 1 3 2 0 0 0 0 3 2 0 0 0 1 7 2 0 0 1 0 9 2 0 0 1 1 3 3 0 0 0 0 2 3 0 0 0 1 9 3 0 0 1 0 1 3 0 0 1 1 0 1 0 1 0 0 4 1 0 1 0 1 6 1 0 1 1 0 9 1 0 1 1 1 6 2 0 1 0 0 9 2 0 1 0 1 7 2 0 1 1 0 5 2 0 1 1 1 8 3 0 1 0 0 6 3 0 1 0 1 6 3 0 1 1 0 3 3 0 1 1 1 4 1 1 0 0 0 8 1 1 0 0 1 8 1 1 0 1 0 10 1 1 0 1 1 7 2 1 0 0 0 10 2 1 0 0 1 1 2 1 0 1 0 8 2 1 0 1 1 11 3 1 0 0 0 4 3 1 0 0 1 10 3 1 0 1 0 5 3 1 0 1 1 2 1 1 1 0 0 10 1 1 1 0 1 6 1 1 1 1 0 10 1 1 1 1 1 6 2 1 1 0 0 2 2 1 1 0 1 5 2 1 1 1 0 9 2 1 1 1 1 4 3 1 1 0 0 3 3 1 1 0 1 5 3 1 1 1 0 9 3 1 1 1 1 2 PDL-Stats-0.855/t/kmeans.t0000644000175000017500000001145214754025140015107 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::Stats::Basic; use PDL::Stats::Kmeans; use PDL::LiteF; use Test::PDL qw(is_pdl eq_pdl); { my $a = iv_cluster( [qw(a a b b)] ); is_pdl $a, pdl([1,1,0,0], [0,0,1,1]), 'independent variable cluster'; } is_pdl scalar iv_cluster([qw(a a BAD b b)]), pdl('1 1 BAD 0 0;0 0 
BAD 1 1'), 'independent variable cluster with bad data'; is_pdl +(sequence(4,3) % 2)->assign(xvals(2,3)), short('1 0 1 0; 0 1 0 1'); { my ($m, $ss) = sequence(4,3)->centroid(byte([1,0,1,0], [0,1,0,1])); is_pdl $m, pdl([1,2], [5,6], [9,10]), "centroid"; is_pdl $ss, ones(2,3) * 2, "centroid"; } { my $centroid = pdl( [0,1], [0,1], [0,1] ); my $a = pdl '0 1 0 1 BAD; 1 0 1 0 1; 0 1 0 1 BAD'; is_pdl $a->assign($centroid), short([1,0,1,0,0], [0,1,0,1,1]), "assign with bad data"; } { my $a = pdl '0 1 2 3 BAD; 5 6 7 8 9; 10 11 12 13 BAD'; my $cluster = pdl(byte, [1,0,1,0,0], [0,1,0,1,1]); my ($m, $ss) = $a->centroid($cluster); my $m_a = pdl([1,2], [6,7.6666667], [11,12]); my $ss_a = pdl([1,1], [1,1.5555556], [1,1]); is_pdl $m, $m_a, "centroid with bad data"; is_pdl $ss, $ss_a, "centroid with bad data"; } # make kmeans deterministic: srandom(5); { my $data = pdl '0 0 2 3 4 5 6; 7 0 9 10 11 12 13; 14 0 16 17 18 19 20'; my %m = $data->kmeans({NCLUS=>2, NSEED=>6, NTRY=>10, V=>0}); is_pdl $m{centroid}->sumover, pdl('3.3333333 10.333333 17.333333'), 'kmeans'; } { my $data = pdl ' [ [0 0 2 3 4 5 6; 0 0 9 10 11 12 13; 14 0 16 17 18 19 20] [21 0 23 24 25 26 27; 28 0 30 31 32 33 34; 35 0 37 38 39 40 41] ] [ [0 0 44 45 46 47 48; 0 0 51 52 53 54 55; 56 0 58 59 60 61 62] [63 0 65 66 67 68 69; 70 0 72 73 74 75 76; 77 0 79 80 81 82 83] ] '; my %m = $data->kmeans( {nclus=>[2,1,1], ntry=>20, v=>0} ); # print "$_\t$m{$_}\n" for sort keys %m; my %a = ( 'R2' => pdl ( [ qw(0.74223245 0.97386667) ], [ qw(0.84172845 0.99499377) ], ), 'ss_sum' => pdl ( [ [ qw( 10 10 108 )], [ qw( 23.333333 23.333333 23.333333 )], ], [ [ qw( 10 10 1578 )], [ qw( 23.333333 23.333333 23.333333 )], ] ), ); is_pdl $m{R2}, $a{R2}, "kmeans R2 result as expected"; is_pdl $m{ss}->sumover, $a{ss_sum}, {atol=>1e-3, test_name=>"kmeans ss result as expected"}; } { my $data = pdl ' [ [0 0 2 3 4 5 6; 0 0 9 10 11 12 13; 14 0 16 17 18 19 20] [21 0 23 24 25 26 27; 28 0 30 31 32 33 34; 35 0 37 38 39 40 41] ] [ [0 0 44 45 46 47 48; 0 
0 51 52 53 54 55; 56 0 58 59 60 61 62] [63 0 65 66 67 68 69; 70 0 72 73 74 75 76; 77 0 79 80 81 82 83] ] '; # centroid intentionally has one less dim than data my $centroid = pdl('[10 0; 10 0; 10 0] [20 0; 30 0; 30 0]'); # use dummy to match centroid dims to data dims my %m = $data->kmeans( {cntrd=>$centroid->dummy(-1), v=>0} ); # print "$_\t$m{$_}\n" for sort keys %m; my %a = ( R2 => pdl('0.74223245 0.97386667; 0.84172845 0.99499377'), ss_sum => pdl(' [10 10 108; 23.333333 23.333333 23.333333] [10 10 1578; 23.333333 23.333333 23.333333] '), ); is_pdl $m{R2}, $a{R2}, "kmeans R2 with manually seeded centroid"; is_pdl $m{ss}->sumover, $a{ss_sum}, {atol=>1e-3, test_name=>"kmeans ss with manually seeded centroid"}; } { my $data = sequence 7, 3; $data = $data->setbadat(4,0); my %m = $data->kmeans({NCLUS=>2, NTRY=>10, V=>0}); #print "$_\t$m{$_}\n" for sort keys %m; is_pdl $m{ms}->sumover, pdl('1.5 1.9166667 1.9166667'), 'kmeans bad'; } { my $data = pdl ' [0 0 2 BAD 4 5 6; 0 0 9 10 11 12 13; 0 0 16 17 18 19 20] [21 22 23 24 1 1 1; 28 29 30 31 1 1 1; 35 36 37 38 1 1 1] '; my %m = $data->kmeans( {nclus=>[2,1], ntry=>20, v=>0} ); # print "$_\t$m{$_}\n" for sort keys %m; my %a = ( 'R2' => pdl( [ qw( 0.96879592 0.99698988 ) ] ), 'ms' => pdl('[2.1875 0; 2 0; 2 0] [0,1.25; 0 1.25; 0 1.25]'), ); is_pdl $m{R2}, $a{R2}, "3d kmeans with bad data R2 is as expected"; is_pdl $m{ms}->sumover, $a{ms}->sumover, {atol=>1e-3, test_name=>"3d kmeans with bad data ss is as expected"}; } { my $l = pdl( [qw( -0.798603 -0.61624 -0.906765 0.103116)], [qw( 0.283269 -0.41041 0.131113 0.894118)], [qw( -0.419717 0.649522 -0.0223668 0.434389)], [qw( 0.325314 0.173015 -0.400108 0.0350236)], ); my $c = $l->pca_cluster({v=>0,ncomp=>4,plot=>0}); is_pdl $c, pdl([1,0,1,0], [0,1,0,0], [0,0,0,1]), "principal component analysis clustering"; } { my $a = pdl( [[3,1], [2,4]] ); my $b = pdl( [2,4], [3,1] ); my $c = pdl( 5,15 ); my $d = PDL::Stats::Kmeans::_d_point2line( $a, $b, $c ); is_pdl $d, pdl(1.754116, 
1.4142136), '_d_point2line'; } { my $c0 = pdl(byte, [1,0,1,0], [0,1,0,1]); my $c1 = pdl(byte, [0,0,0,1], [0,1,1,0]); my $c = cat $c0, $c1; my $ans = indx( [0,1,0,1], [-1,1,1,0] ); is_pdl $c->which_cluster, $ans, 'which_cluster'; } done_testing(); PDL-Stats-0.855/t/basic.t0000644000175000017500000001675714762213040014724 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::LiteF; use PDL::Stats::Basic; use Test::PDL; my $a = sequence 5; is_pdl $a->stdv, pdl( 1.4142135623731 ), "standard deviation of $a"; is_pdl $a->stdv_unbiased, pdl( 1.58113883008419 ), "unbiased standard deviation of $a"; is_pdl $a->var, pdl( 2 ), "variance of $a"; is_pdl $a->var_unbiased, pdl( 2.5 ), "unbiased variance of $a"; is_pdl $a->se, pdl( 0.707106781186548 ), "standard error of $a"; is_pdl $a->ss, pdl( 10 ), "sum of squared deviations from the mean of $a"; is_pdl $a->skew, pdl( 0 ), "sample skewness of $a"; is_pdl $a->skew_unbiased, pdl( 0 ), "unbiased sample skewness of $a"; is_pdl $a->kurt, pdl( -1.3 ), "sample kurtosis of $a"; is_pdl $a->kurt_unbiased, pdl( -1.2 ), "unbiased sample kurtosis of $a"; { my $x = pdl [(0.001) x 6]; my $var = $x->var; ok $var >= 0, 'var >= 0' or diag "var = $var"; my $stdv = $x->stdv; ok $stdv >= 0, 'stdv >= 0' or diag "stdv = $stdv"; } is_pdl $_->ss, (($_ - $_->avg)**2)->sumover, "ss for $_" for pdl('[1 1 1 1 2 3 4 4 4 4 4 4]'), pdl('[1 2 2 2 3 3 3 3 4 4 5 5]'), pdl('[1 1 1 2 2 3 3 4 4 5 5 5]'); my $a_bad = sequence 6; $a_bad->setbadat(-1); is_pdl $a_bad->stdv, pdl( 1.4142135623731 ), "standard deviation of $a_bad"; is_pdl $a_bad->stdv_unbiased, pdl( 1.58113883008419 ), "unbiased standard deviation of $a_bad"; is_pdl $a_bad->var, pdl( 2 ), "variance of $a_bad"; is_pdl $a_bad->var_unbiased, pdl( 2.5 ), "unbiased variance of $a_bad"; is_pdl $a_bad->se, pdl( 0.707106781186548 ), "standard error of $a_bad"; is_pdl $a_bad->ss, pdl( 10 ), "sum of squared deviations from the mean of $a_bad"; is_pdl $a_bad->skew, pdl( 0 ), "sample skewness of 
$a_bad"; is_pdl $a_bad->skew_unbiased, pdl( 0 ), "unbiased sample skewness of $a_bad"; is_pdl $a_bad->kurt, pdl( -1.3 ), "sample kurtosis of $a_bad"; is_pdl $a_bad->kurt_unbiased, pdl( -1.2 ), "unbiased sample kurtosis of $a_bad"; my $b = pdl '0 0 0 1 1'; is_pdl $a->cov($b), pdl( 0.6 ), "sample covariance of $a and $b"; is_pdl $a->corr($b), pdl( 0.866025403784439 ), "Pearson correlation coefficient of $a and $b"; is_pdl $a->n_pair($b), indx( 5 ), "Number of good pairs between $a and $b"; is_pdl $a->corr($b)->t_corr( 5 ), pdl( 3 ), "t significance test of Pearson correlation coefficient of $a and $b"; is_pdl $a->corr_dev($b), pdl( 0.903696114115064 ), "correlation calculated from dev_m values of $a and $b"; my $b_bad = pdl 'BAD 0 0 1 1 1'; is_pdl $a_bad->cov($b_bad), pdl( 0.5 ), "sample covariance with bad data of $a_bad and $b_bad"; is_pdl $a_bad->corr($b_bad), pdl( 0.894427190999916 ), "Pearson correlation coefficient with bad data of $a_bad and $b_bad"; is_pdl $a_bad->n_pair($b_bad), indx( 4 ), "Number of good pairs between $a_bad and $b_bad with bad values taken into account"; is_pdl $a_bad->corr($b_bad)->t_corr( 4 ), pdl( 2.82842712474619 ), "t signifiance test of Pearson correlation coefficient with bad data of $a_bad and $b_bad"; is_pdl $a_bad->corr_dev($b_bad), pdl( 0.903696114115064 ), "correlation calculated from dev_m values with bad data of $a_bad and $b_bad"; my ($t, $df) = $a->t_test($b); is_pdl $t, pdl( 2.1380899352994 ), "t-test between $a and $b - 't' output"; is_pdl $df, pdl( 8 ), "t-test between $a and $b - 'df' output"; ($t, $df) = $a->t_test_nev($b); is_pdl $t, pdl( 2.1380899352994 ), "t-test with non-equal variance between $a and $b - 't' output"; is_pdl $df, pdl( 4.94637223974763 ), "t-test with non-equal variance between $a and $b - 'df' output"; ($t, $df) = $a->t_test_paired($b); is_pdl $t, pdl( 3.13785816221094 ), "paired sample t-test between $a and $b - 't' output"; is_pdl $df, pdl( 4 ), "paired sample t-test between $a and $b - 'df' 
output"; ($t, $df) = $a_bad->t_test($b_bad); is_pdl $t, pdl( 1.87082869338697 ), "t-test with bad values between $a_bad and $b_bad - 't' output"; is_pdl $df, pdl( 8 ), "t-test with bad values between $a_bad and $b_bad - 'd' output"; ($t, $df) = $a_bad->t_test_nev($b_bad); is_pdl $t, pdl( 1.87082869338697 ), "t-test with non-equal variance with bad values between $a_bad and $b_bad - 't' output"; is_pdl $df, pdl( 4.94637223974763 ), "t-test with non-equal variance with bad values between $a_bad and $b_bad - 'df' output"; ($t, $df) = $a_bad->t_test_paired($b_bad); is_pdl $t, pdl( 4.89897948556636 ), "paired sample t-test with bad values between $a_bad and $b_bad - 't' output"; is_pdl $df, pdl( 3 ), "paired sample t-test with bad values between $a_bad and $b_bad - 'df' output"; { my ($data, $idv, $ido) = rtable(\*DATA, {V=>0}); is_pdl $data, pdl ' [ 5 BAD BAD 2 BAD 5 BAD 9 4 BAD BAD BAD 5 BAD] [ 7 BAD 3 7 0 BAD 0 8 BAD 0 3 0 BAD 0] [BAD BAD BAD BAD BAD 1 BAD 1 BAD BAD BAD BAD 1 BAD] [BAD BAD BAD BAD BAD 0 BAD 5 BAD BAD BAD BAD 0 BAD] [BAD BAD 0 BAD 2 BAD 0 BAD BAD 0 0 2 BAD 0] '; } { my $a = pdl ' 0.045 0.682 0.290 0.024 0.598 0.321 0.772 0.375 0.237 0.811; 0.356 0.094 0.925 0.139 0.701 0.849 0.689 0.109 0.240 0.847; 0.822 0.492 0.351 0.860 0.400 0.243 0.313 0.011 0.437 0.480 '; is_pdl $a->cov_table, $a->cov($a->dummy(1)), 'cov_table'; $a->setbadat(4,0); is_pdl $a->cov_table, $a->cov($a->dummy(1)), 'cov_table bad val'; } { my $a = pdl ' 0.045 0.682 0.290 0.024 0.598 0.321 0.772 0.375 0.237 0.811; 0.356 0.094 0.925 0.139 0.701 0.849 0.689 0.109 0.240 0.847; 0.822 0.492 0.351 0.860 0.400 0.243 0.313 0.011 0.437 0.480 '; is_pdl $a->corr_table, $a->corr($a->dummy(1)), "Square Pearson correlation table"; $a->setbadat(4,0); is_pdl $a->corr_table, $a->corr($a->dummy(1)), "Square Pearson correlation table with bad data"; } { my $a = pdl([0,1,2,3,4], [0,0,0,0,0]); $a = $a->setvaltobad(0); ok $a->stdv->nbad, "Bad value input to stdv makes the stdv itself bad"; } SKIP: { eval { 
require PDL::Core; require PDL::GSL::CDF; }; skip 'no PDL::GSL::CDF', 1 if $@; my $x = pdl(1, 2); my $n = pdl(2, 10); my $p = .5; is_pdl binomial_test( $x,$n,$p ), pdl(0.75, 0.9892578125), 'binomial_test'; } { my $a = sequence 10, 2; my $factor = sequence(10) > 4; my $ans = pdl( [[0..4], [10..14]], [[5..9], [15..19]] ); my ($a_, $l) = $a->group_by($factor); is_pdl $a_, $ans, 'group_by single factor equal n'; is_deeply $l, [0, 1], 'group_by single factor label'; $a = sequence 10,2; $factor = qsort sequence(10) % 3; $ans = pdl( [1.5, 11.5], [5, 15], [8, 18] ); is_pdl $a->group_by($factor)->average, $ans, 'group_by single factor unequal n'; $a = sequence 10; my @factors = ( [qw( a a a a b b b b b b )], [qw(0 1 0 1 0 1 0 1 0 1)] ); $ans = pdl '[ 0 2 BAD; 1 3 BAD ], [ 4 6 8; 5 7 9 ]'; ($a_, $l) = $a->group_by( @factors ); is_pdl $a_, $ans, 'group_by multiple factors'; is_deeply $l, [[qw(a_0 a_1)], [qw( b_0 b_1 )]], 'group_by multiple factors label'; } { my @a = qw(a a b b c c); my $a = PDL::Stats::Basic::code_ivs( \@a ); my $ans = pdl( 0,0,1,1,2,2 ); is_pdl $a, $ans, 'code_ivs'; $a[-1] = undef; my $a_bad = PDL::Stats::Basic::code_ivs( \@a ); my $ans_bad = pdl '0 0 1 1 2 BAD'; is_pdl $a_bad, $ans_bad, 'code_ivs with missing value undef correctly coded'; $a[-1] = 'BAD'; $a_bad = PDL::Stats::Basic::code_ivs( \@a ); is_pdl $a_bad, $ans_bad, 'code_ivs with missing value BAD correctly coded'; } done_testing(); __DATA__ 999 90 91 92 93 94 70 5 7 -999 -999 -999 711 trying 71 -999 3 -999 -999 0 72 2 7 -999 -999 -999 73 -999 0 -999 -999 2 74 5 -999 1 0 -999 75 -999 0 -999 -999 0 76 9 8 1 5 -999 77 4 -999 -999 -999 -999 78 -999 0 -999 -999 0 79 -999 3 -999 -999 0 80 -999 0 -999 -999 2 81 5 -999 1 0 -999 82 -999 0 -999 -999 0 PDL-Stats-0.855/t/00-report-prereqs.t0000644000175000017500000001347614544645567017072 0ustar osboxesosboxesuse strict; use warnings; # This test was generated by Dist::Zilla::Plugin::Test::ReportPrereqs 0.020 # THEN modified with more info by Ed J for PDL 
project use Test::More tests => 1; use ExtUtils::MakeMaker; use File::Spec; # from $version::LAX my $lax_version_re = qr/(?: undef | (?: (?:[0-9]+) (?: \. | (?:\.[0-9]+) (?:_[0-9]+)? )? | (?:\.[0-9]+) (?:_[0-9]+)? ) | (?: v (?:[0-9]+) (?: (?:\.[0-9]+)+ (?:_[0-9]+)? )? | (?:[0-9]+)? (?:\.[0-9]+){2,} (?:_[0-9]+)? ) )/x; # hide optional CPAN::Meta modules from prereq scanner # and check if they are available my $cpan_meta = "CPAN::Meta"; my $cpan_meta_pre = "CPAN::Meta::Prereqs"; my $HAS_CPAN_META = eval "require $cpan_meta; $cpan_meta->VERSION('2.120900')" && eval "require $cpan_meta_pre"; ## no critic # Verify requirements? my $DO_VERIFY_PREREQS = 1; sub _max { my $max = shift; $max = ( $_ > $max ) ? $_ : $max for @_; return $max; } sub _merge_prereqs { my ($collector, $prereqs) = @_; # CPAN::Meta::Prereqs object if (ref $collector eq $cpan_meta_pre) { return $collector->with_merged_prereqs( CPAN::Meta::Prereqs->new( $prereqs ) ); } # Raw hashrefs for my $phase ( keys %$prereqs ) { for my $type ( keys %{ $prereqs->{$phase} } ) { for my $module ( keys %{ $prereqs->{$phase}{$type} } ) { $collector->{$phase}{$type}{$module} = $prereqs->{$phase}{$type}{$module}; } } } return $collector; } my @include = qw( ); my @exclude = qw( ); # Add static prereqs to the included modules list my $static_prereqs = do 't/00-report-prereqs.dd'; # Merge all prereqs (either with ::Prereqs or a hashref) my $full_prereqs = _merge_prereqs( ( $HAS_CPAN_META ? $cpan_meta_pre->new : {} ), $static_prereqs ); # Add dynamic prereqs to the included modules list (if we can) my ($source) = grep { -f } 'MYMETA.json', 'MYMETA.yml'; if ( $source && $HAS_CPAN_META ) { if ( my $meta = eval { CPAN::Meta->load_file($source) } ) { $full_prereqs = _merge_prereqs($full_prereqs, $meta->prereqs); } } else { $source = 'static metadata'; } my @full_reports; my @dep_errors; my $req_hash = $HAS_CPAN_META ? 
$full_prereqs->as_string_hash : $full_prereqs; # Add static includes into a fake section for my $mod (@include) { $req_hash->{other}{modules}{$mod} = 0; } for my $phase ( qw(configure build test runtime develop other) ) { next unless $req_hash->{$phase}; next if ($phase eq 'develop' and not $ENV{AUTHOR_TESTING}); for my $type ( qw(requires recommends suggests conflicts modules) ) { next unless $req_hash->{$phase}{$type}; my $title = ucfirst($phase).' '.ucfirst($type); my @reports = [qw/Module Want Have Where Howbig/]; for my $mod ( sort keys %{ $req_hash->{$phase}{$type} } ) { next if $mod eq 'perl'; next if grep { $_ eq $mod } @exclude; my $file = $mod; $file =~ s{::}{/}g; $file .= ".pm"; my ($prefix) = grep { -e File::Spec->catfile($_, $file) } @INC; my $want = $req_hash->{$phase}{$type}{$mod}; $want = "undef" unless defined $want; $want = "any" if !$want && $want == 0; my $req_string = $want eq 'any' ? 'any version required' : "version '$want' required"; if ($prefix) { my $filename = File::Spec->catfile($prefix, $file); my $have = MM->parse_version( $filename ); $have = "undef" unless defined $have; push @reports, [$mod, $want, $have, $prefix, (-s $filename)]; if ( $DO_VERIFY_PREREQS && $HAS_CPAN_META && $type eq 'requires' ) { if ( $have !~ /\A$lax_version_re\z/ ) { push @dep_errors, "$mod version '$have' cannot be parsed ($req_string)"; } elsif ( ! 
$full_prereqs->requirements_for( $phase, $type )->accepts_module( $mod => $have ) ) { push @dep_errors, "$mod version '$have' is not in required range '$want'"; } } } else { push @reports, [$mod, $want, "missing", '', 0]; if ( $DO_VERIFY_PREREQS && $type eq 'requires' ) { push @dep_errors, "$mod is not installed ($req_string)"; } } } if ( @reports ) { push @full_reports, "=== $title ===\n\n"; my $ml = _max( map { length $_->[0] } @reports ); my $wl = _max( map { length $_->[1] } @reports ); my $hl = _max( map { length $_->[2] } @reports ); my $ll = _max( map { length $_->[3] } @reports ); # location my $sl = _max( map { length $_->[4] } @reports ); # size if ($type eq 'modules') { splice @reports, 1, 0, ["-" x $ml, "", "-" x $hl, "-" x $ll, "-" x $sl]; push @full_reports, map { sprintf(" %*s %*s\n", -$ml, $_->[0], $hl, $_->[2]) } @reports; } else { splice @reports, 1, 0, ["-" x $ml, "-" x $wl, "-" x $hl, "-" x $ll, "-" x $sl]; push @full_reports, map { sprintf(" %*s %*s %*s %*s %*s\n", -$ml, $_->[0], $wl, $_->[1], $hl, $_->[2], -$ll, $_->[3], $sl, $_->[4]) } @reports; } push @full_reports, "\n"; } } } if ( @full_reports ) { diag "\nVersions for all modules listed in $source (including optional ones):\n\n", @full_reports; } if ( @dep_errors ) { diag join("\n", "\n*** WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING ***\n", "The following REQUIRED prerequisites were not satisfied:\n", @dep_errors, "\n" ); } pass; # vim: ts=4 sts=4 sw=4 et: PDL-Stats-0.855/t/ts.t0000644000175000017500000000543514752576216014300 0ustar osboxesosboxesuse strict; use warnings; use Test::More; use PDL::LiteF; use PDL::Stats::TS; use Test::PDL; { my $a = sequence 10; is_pdl $a->acvf(4), pdl('82.5 57.75 34 12.25 -6.5'), "autocovariance on $a"; is_pdl $a->acf(4), pdl('1 0.7 0.41212121 0.14848485 -0.078787879'), "autocorrelation on $a"; is_pdl $a->filter_ma(2), pdl( '.6 1.2 2 3 4 5 6 7 7.8 8.4'), "filter moving average on $a"; is_pdl $a->filter_exp(.8), pdl('0 0.8 1.76 2.752 
3.7504 4.75008 5.750016 6.7500032 7.7500006 8.7500001'), "filter with exponential smoothing on $a"; is_pdl $a->acf(5)->portmanteau($a->nelem), pdl( 11.1753902662994 ), "portmanteau significance test on $a"; my $b = pdl '1 2 3 4 5 6 7 9 9 10'; is_pdl $b->mape($a), pdl( 0.302619047619048 ), "mean absolute percent error between $a and $b"; is_pdl $b->mae($a), pdl( 1.1 ), "mean absolute error between $a and $b"; $b = $b->setbadat(3); is_pdl $b->mape($a), pdl( 0.308465608465608 ), "mean absolute percent error with bad data between $a and $b"; is_pdl $b->mae($a), pdl( 1.11111111111111 ), "mean absolute error with bad data between $a and $b"; } { my $a = sequence(5)->dummy(1,2)->flat->sever; is_pdl $a->dseason(5), pdl('0.6 1.2 2 2 2 2 2 2 2.8 3.4'), "deseasonalize data on $a with period 5"; is_pdl $a->dseason(4), pdl('0.5 1.125 2 2.375 2.125 1.875 1.625 2 2.875 3.5'), "deseasonalize data on $a with period 4"; $a = $a->setbadat(4); is_pdl $a->dseason(5), pdl('0.6 1.2 1.5 1.5 1.5 1.5 1.5 2 2.8 3.4'), "deseasonalize data with bad data on $a with period 5"; is_pdl $a->dseason(4), pdl('0.5 1.125 2 1.8333333 1.5 1.1666667 1.5 2 2.875 3.5'), "deseasonalized data with bad data on $a with period 4"; } { my $a = pdl '0 1 BAD 3; 4 5 BAD 7'; my $a_ans = pdl( [qw( 0 1 1.75 3)], [qw( 4 5 5.75 7 )], ); is_pdl $a->fill_ma(2), $a_ans, "fill missing data with moving average"; } { my $x = sequence 2; is_pdl $x->pred_ar(pdl(.8, -.2, .3), 7), pdl('[0 1 1.1 0.74 0.492 0.3656 0.31408]'), "predict autoregressive series"; is_pdl $x->pred_ar(pdl(.8, -.2), 7, {const=>0}), pdl('[0 1 0.8 0.44 0.192 0.0656 0.01408]'), "predict autoregressive series w/no constant last value"; } { my $a = sequence 10; my $b = pdl( qw(0 1 1 1 3 6 7 7 9 10) ); is_pdl $a->wmape($b), pdl(0.177777777777778), "weighted mean absolute percent error between $a and $b"; $a = $a->setbadat(4); is_pdl $a->wmape($b), pdl(0.170731707317073), "weighted mean absolute percent error with bad data between $a and $b"; } { my $a = pdl '0 3 2 
3 4 0 1 2 3 4 0 1 2 3 4; 0 3 2 3 0 0 1 2 3 4 0 1 2 3 4'; my $ans_m = pdl('4 0 1.6666667 2 3; 2.6666667 0 1.6666667 2 3'); my $ans_ms = pdl('0 0 0.88888889 0 0; 3.5555556 0 0.88888889 0 0'); my ($m, $ms) = $a->season_m( 5, {start_position=>1, plot=>0} ); is_pdl $m, $ans_m, 'season_m m'; is_pdl $ms, $ans_ms, 'season_m ms'; } done_testing; PDL-Stats-0.855/README.md0000644000175000017500000000604014750173363014464 0ustar osboxesosboxes# PDL-Stats | Build status | | ------------- | | ![Build Status](https://github.com/PDLPorters/PDL-Stats/workflows/perl/badge.svg?branch=master) | [![Coverage Status](https://coveralls.io/repos/PDLPorters/PDL-Stats/badge.svg?branch=master&service=github)](https://coveralls.io/github/PDLPorters/PDL-Stats?branch=master) [![CPAN version](https://badge.fury.io/pl/PDL-Stats.svg)](https://metacpan.org/pod/PDL::Stats) This is a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people. They make perldl--the simple shell for PDL--work like a teenie weenie R, but with PDL broadcasting--"the fast (and automagic) vectorised iteration of 'elementary operations' over arbitrary slices of multidimensional data"--on procedures including t-test, ordinary least squares regression, and kmeans. Of course, they also work in perl scripts. ## DEPENDENCIES - PDL Perl Data Language. The required PDL version is 2.096. - PDL::GSL (Optional) PDL interface to GNU Scientific Library. This provides PDL::Stats::Distr and PDL::GSL::CDF, the latter of which provides p-values for PDL::Stats::GLM. GSL is otherwise NOT required for the core PDL::Stats modules to work, ie Basic, Kmeans, and GLM. - PDL::Graphics::Simple (Optional) PDL-Stats currently uses this for plotting. It can use any of several engines to achieve this. 
## INSTALLATION ### \*nix For standard perl module installation in \*nix environment form source, to install all included modules, extract the files from the archive by entering this at a shell, tar xvf PDL-Stats-xxx.tar.gz then change to the PDL-Stats directory, cd PDL-Stats-xxx and run the following commands: perl Makefile.PL make make test sudo make install If you don't have permission to run sudo, you can specify an alternative path, perl Makefile.PL PREFIX=/home/user/my_perl_lib make make test make install then add `/home/user/my_perl_lib` to your PERL5LIB environment variable. If you have trouble installing PDL, you can look for help at the PDL wiki or PDL mailing list. ## SUPPORT AND DOCUMENTATION After installing, you can find documentation for the modules with the perldoc command. perldoc PDL::Stats perldoc PDL::Stats::Basic etc. You can also look for information at: Home https://github.com/PDLPorters/PDL-Stats Search CPAN https://metacpan.org/dist/PDL-Stats Mailing list (low traffic, open a GitHub issue instead) https://lists.sourceforge.net/lists/listinfo/pdl-stats-help If you notice a bug or have a request, please submit a report at [https://github.com/PDLPorters/PDL-Stats/issues](https://github.com/PDLPorters/PDL-Stats/issues) If you would like to help develop or maintain the package, please email me at the address below. ## COPYRIGHT AND LICENCE Copyright (C) 2009-2012 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. 
PDL-Stats-0.855/GENERATED/0000755000175000017500000000000014762214364014503 5ustar osboxesosboxesPDL-Stats-0.855/GENERATED/PDL/0000755000175000017500000000000014762214364015122 5ustar osboxesosboxesPDL-Stats-0.855/GENERATED/PDL/Stats/0000755000175000017500000000000014762214365016221 5ustar osboxesosboxesPDL-Stats-0.855/GENERATED/PDL/Stats/Basic.pm0000644000175000017500000006204114762214365017603 0ustar osboxesosboxes# # GENERATED WITH PDL::PP from lib/PDL/Stats/Basic.pd! Don't modify! # package PDL::Stats::Basic; our @EXPORT_OK = qw(binomial_test rtable which_id code_ivs stdv stdv_unbiased var var_unbiased se ss skew skew_unbiased kurt kurt_unbiased cov cov_table corr corr_table t_corr n_pair corr_dev t_test t_test_nev t_test_paired ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::Basic ; #line 9 "lib/PDL/Stats/Basic.pd" use strict; use warnings; use PDL::LiteF; use Carp; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; =head1 NAME PDL::Stats::Basic -- basic statistics and related utilities such as standard deviation, Pearson correlation, and t-tests. =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are NOT broadcastable, respectively. Does not have mean or median function here. see SEE ALSO. =head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::Basic; my $stdv = $data->stdv; or my $stdv = stdv( $data ); =cut #line 59 "lib/PDL/Stats/Basic.pm" =head1 FUNCTIONS =cut =head2 stdv =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = stdv($a); stdv($a, $b); # all arguments given $b = $a->stdv; # method call $a->stdv($b); =for ref Sample standard deviation. =pod Broadcasts over its inputs. =for bad C processes bad values. 
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *stdv = \&PDL::stdv; =head2 stdv_unbiased =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = stdv_unbiased($a); stdv_unbiased($a, $b); # all arguments given $b = $a->stdv_unbiased; # method call $a->stdv_unbiased($b); =for ref Unbiased estimate of population standard deviation. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *stdv_unbiased = \&PDL::stdv_unbiased; =head2 var =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = var($a); var($a, $b); # all arguments given $b = $a->var; # method call $a->var($b); =for ref Sample variance. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *var = \&PDL::var; =head2 var_unbiased =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = var_unbiased($a); var_unbiased($a, $b); # all arguments given $b = $a->var_unbiased; # method call $a->var_unbiased($b); =for ref Unbiased estimate of population variance. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *var_unbiased = \&PDL::var_unbiased; =head2 se =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = se($a); se($a, $b); # all arguments given $b = $a->se; # method call $a->se($b); =for ref Standard error of the mean. Useful for calculating confidence intervals. =for example # 95% confidence interval for samples with large N $ci_95_upper = $data->average + 1.96 * $data->se; $ci_95_lower = $data->average - 1.96 * $data->se; =pod Broadcasts over its inputs. =for bad C processes bad values. 
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *se = \&PDL::se; =head2 ss =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = ss($a); ss($a, $b); # all arguments given $b = $a->ss; # method call $a->ss($b); =for ref Sum of squared deviations from the mean. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *ss = \&PDL::ss; =head2 skew =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = skew($a); skew($a, $b); # all arguments given $b = $a->skew; # method call $a->skew($b); =for ref Sample skewness, measure of asymmetry in data. skewness == 0 for normal distribution. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *skew = \&PDL::skew; =head2 skew_unbiased =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = skew_unbiased($a); skew_unbiased($a, $b); # all arguments given $b = $a->skew_unbiased; # method call $a->skew_unbiased($b); =for ref Unbiased estimate of population skewness. This is the number in GNumeric Descriptive Statistics. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *skew_unbiased = \&PDL::skew_unbiased; =head2 kurt =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = kurt($a); kurt($a, $b); # all arguments given $b = $a->kurt; # method call $a->kurt($b); =for ref Sample kurtosis, measure of "peakedness" of data. kurtosis == 0 for normal distribution. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *kurt = \&PDL::kurt; =head2 kurt_unbiased =for sig Signature: (a(n); [o]b()) Types: (float double) =for usage $b = kurt_unbiased($a); kurt_unbiased($a, $b); # all arguments given $b = $a->kurt_unbiased; # method call $a->kurt_unbiased($b); =for ref Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *kurt_unbiased = \&PDL::kurt_unbiased; =head2 cov =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = cov($a, $b); cov($a, $b, $c); # all arguments given $c = $a->cov($b); # method call $a->cov($b, $c); =for ref Sample covariance. see B for ways to call =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *cov = \&PDL::cov; =head2 cov_table =for sig Signature: (a(n,m); [o]c(m,m)) Types: (sbyte byte short ushort long ulong indx ulonglong longlong float double ldouble) =for usage $c = cov_table($a); cov_table($a, $c); # all arguments given $c = $a->cov_table; # method call $a->cov_table($c); =for ref Square covariance table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. 
=for example Usage: # 5 obs x 3 var, 2 such data tables pdl> $a = random 5, 3, 2 pdl> p $cov = $a->cov_table [ [ [ 8.9636438 -1.8624472 -1.2416588] [-1.8624472 14.341514 -1.4245366] [-1.2416588 -1.4245366 9.8690655] ] [ [ 10.32644 -0.31311789 -0.95643674] [-0.31311789 15.051779 -7.2759577] [-0.95643674 -7.2759577 5.4465141] ] ] # diagonal elements of the cov table are the variances pdl> p $a->var [ [ 8.9636438 14.341514 9.8690655] [ 10.32644 15.051779 5.4465141] ] for the same cov matrix table using B, pdl> p $a->dummy(2)->cov($a->dummy(1)) =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *cov_table = \&PDL::cov_table; =head2 corr =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = corr($a, $b); corr($a, $b, $c); # all arguments given $c = $a->corr($b); # method call $a->corr($b, $c); =for ref Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)). =for example Usage: pdl> $a = random 5, 3 pdl> $b = sequence 5,3 pdl> p $a->corr($b) [0.20934208 0.30949881 0.26713007] for square corr table pdl> p $a->corr($a->dummy(1)) [ [ 1 -0.41995259 -0.029301192] [ -0.41995259 1 -0.61927619] [-0.029301192 -0.61927619 1] ] but it is easier and faster to use B. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *corr = \&PDL::corr; =head2 corr_table =for sig Signature: (a(n,m); [o]c(m,m)) Types: (sbyte byte short ushort long ulong indx ulonglong longlong float double ldouble) =for usage $c = corr_table($a); corr_table($a, $c); # all arguments given $c = $a->corr_table; # method call $a->corr_table($c); =for ref Square Pearson correlation table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. 
And it is easier to use with higher dimension pdls. =for example Usage: # 5 obs x 3 var, 2 such data tables pdl> $a = random 5, 3, 2 pdl> p $a->corr_table [ [ [ 1 -0.69835951 -0.18549048] [-0.69835951 1 0.72481605] [-0.18549048 0.72481605 1] ] [ [ 1 0.82722569 -0.71779883] [ 0.82722569 1 -0.63938828] [-0.71779883 -0.63938828 1] ] ] for the same result using B, pdl> p $a->dummy(2)->corr($a->dummy(1)) This is also how to use B and B with such a table. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *corr_table = \&PDL::corr_table; =head2 t_corr =for sig Signature: (r(); n(); [o]t()) Types: (float double) =for usage $t = t_corr($r, $n); t_corr($r, $n, $t); # all arguments given $t = $r->t_corr($n); # method call $r->t_corr($n, $t); =for ref t significance test for Pearson correlations. =for example $corr = $data->corr( $data->dummy(1) ); $n = $data->n_pair( $data->dummy(1) ); $t_corr = $corr->t_corr( $n ); use PDL::GSL::CDF; $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t_corr->abs, $n-2 )); =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *t_corr = \&PDL::t_corr; =head2 n_pair =for sig Signature: (a(n); b(n); indx [o]c()) Types: (long longlong) =for usage $c = n_pair($a, $b); n_pair($a, $b, $c); # all arguments given $c = $a->n_pair($b); # method call $a->n_pair($b, $c); =for ref Returns the number of good pairs between 2 lists. Useful with B (esp. when bad values are involved) =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *n_pair = \&PDL::n_pair; =head2 corr_dev =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = corr_dev($a, $b); corr_dev($a, $b, $c); # all arguments given $c = $a->corr_dev($b); # method call $a->corr_dev($b, $c); =for ref Calculates correlations from B vals. Seems faster than doing B from original vals when data pdl is big =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *corr_dev = \&PDL::corr_dev; =head2 t_test =for sig Signature: (a(n); b(m); [o]t(); [o]d()) Types: (float double) =for usage ($t, $d) = t_test($a, $b); t_test($a, $b, $t, $d); # all arguments given ($t, $d) = $a->t_test($b); # method call $a->t_test($b, $t, $d); =for ref Independent sample t-test, assuming equal var. =for example my ($t, $df) = t_test( $pdl1, $pdl2 ); use PDL::GSL::CDF; my $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t->abs, $df )); =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *t_test = \&PDL::t_test; =head2 t_test_nev =for sig Signature: (a(n); b(m); [o]t(); [o]d()) Types: (float double) =for usage ($t, $d) = t_test_nev($a, $b); t_test_nev($a, $b, $t, $d); # all arguments given ($t, $d) = $a->t_test_nev($b); # method call $a->t_test_nev($b, $t, $d); =for ref Independent sample t-test, NOT assuming equal var. ie Welch two sample t test. Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *t_test_nev = \&PDL::t_test_nev; =head2 t_test_paired =for sig Signature: (a(n); b(n); [o]t(); [o]d()) Types: (float double) =for usage ($t, $d) = t_test_paired($a, $b); t_test_paired($a, $b, $t, $d); # all arguments given ($t, $d) = $a->t_test_paired($b); # method call $a->t_test_paired($b, $t, $d); =for ref Paired sample t-test. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *t_test_paired = \&PDL::t_test_paired; #line 658 "lib/PDL/Stats/Basic.pd" #line 659 "lib/PDL/Stats/Basic.pd" =head2 binomial_test =for Sig Signature: (x(); n(); p_expected(); [o]p()) =for ref Binomial test. One-tailed significance test for two-outcome distribution. Given the number of successes, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes. This function does NOT currently support bad value in the number of successes. =for example Usage: # assume a fair coin, ie. 0.5 probablity of getting heads # test whether getting 8 heads out of 10 coin flips is unusual my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual. =cut *binomial_test = \&PDL::binomial_test; sub PDL::binomial_test { my ($x, $n, $P) = @_; carp 'Please install PDL::GSL::CDF.' unless $CDF; carp 'This function does NOT currently support bad value in the number of successes.' if $x->badflag(); my $pdlx = pdl($x); $pdlx->badflag(1); $pdlx = $pdlx->setvaltobad(0); my $p = 1 - PDL::GSL::CDF::gsl_cdf_binomial_P( $pdlx - 1, $P, $n ); $p = $p->setbadtoval(1); $p->badflag(0); return $p; } =head1 METHODS =head2 rtable =for ref Reads either file or file handle*. Returns observation x variable pdl and var and obs ids if specified. Ids in perl @ ref to allow for non-numeric ids. Other non-numeric entries are treated as missing, which are filled with $opt{MISSN} then set to BAD*. 
Can specify num of data rows to read from top but not arbitrary range.

*If passed handle, it will not be closed here.

=for options

Default options (case insensitive):

    V       => 1,        # verbose. prints simple status
    TYPE    => double,
    C_ID    => 1,        # boolean. file has col id.
    R_ID    => 1,        # boolean. file has row id.
    R_VAR   => 0,        # boolean. set to 1 if var in rows
    SEP     => "\t",     # can take regex qr//
    MISSN   => -999,     # this value treated as missing and set to BAD
    NROW    => '',       # set to read specified num of data rows

=for usage

Usage:

    Sample file diet.txt:

    uid	height	weight	diet
    akw	72	320	1
    bcm	68	268	1
    clq	67	180	2
    dwm	70	200	2

    ($data, $idv, $ido) = rtable 'diet.txt';

    # By default prints out data info and @$idv index and element

    reading diet.txt for data and id... OK.
    data table as PDL dim o x v: PDL: Double D [4,3]
    0	height
    1	weight
    2	diet

    Another way of using it,

    $data = rtable( \*STDIN, {TYPE=>long} );

=cut

# Reads a table from a file name or an already-open filehandle.
# Returns an obs x var data matrix, plus var and obs ids (perl array
# refs, to allow non-numeric ids) when present.  Non-numeric data
# entries are replaced with $opt{MISSN} and then set to BAD.
sub rtable {
  # returns obs x var data matrix and var and obs ids
  my ($src, $opt) = @_;
  my $fh_in;
  if ($src =~ /STDIN/ or ref $src eq 'GLOB') {
    $fh_in = $src;
  }
  else {
    # three-arg open: avoids mode injection through $src and names the
    # offending file on failure (was: open $fh_in, $src or croak "$!")
    open $fh_in, '<', $src or croak "Cannot open '$src' for reading: $!";
  }
  my %opt = (
    V     => 1,        # verbose. prints simple status
    TYPE  => double,
    C_ID  => 1,        # boolean. file has col id.
    R_ID  => 1,        # boolean. file has row id.
    R_VAR => 0,        # boolean. set to 1 if var in rows
    SEP   => "\t",     # can take regex qr//
    MISSN => -999,     # this value treated as missing and set to BAD
    NROW  => '',       # set to read specified num of data rows
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{V} and print "reading $src for data and id... ";
  # pdl() fills undef entries with $PDL::undefval; localized override
  local $PDL::undefval = $opt{MISSN};
  my $id_c = [];     # match declaration of $id_r for return purpose
  if ($opt{C_ID}) {
    chomp( $id_c = <$fh_in> );
    my @entries = split $opt{SEP}, $id_c;
    $opt{R_ID} and shift @entries;
    $id_c = \@entries;
  }
  my ($c_row, $id_r, $data, @data) = (0, [], PDL->null, );
  while (<$fh_in>) {
    chomp;
    my @entries = split /$opt{SEP}/, $_, -1;
    $opt{R_ID} and push @$id_r, shift @entries;
    # rudimentary check for numeric entry
    for (@entries) { $_ = $opt{MISSN} unless defined $_ and m/\d\b/ }
    push @data, pdl( $opt{TYPE}, \@entries );
    $c_row ++;
    last if $opt{NROW} and $c_row == $opt{NROW};
  }
  # not explicitly closing $fh_in here in case it's passed from outside
  # $fh_in will close by going out of scope if opened here.
  $data = pdl $opt{TYPE}, @data;
  @data = ();
  # rid of last col unless there is data there
  $data = $data->slice([0, $data->getdim(0)-2])->sever
    unless ( nelem $data->slice(-1)->where($data->slice(-1) != $opt{MISSN}) );
  my ($idv, $ido) = ($id_r, $id_c);
  # var in columns instead of rows
  $opt{R_VAR} == 0
    and ($data, $idv, $ido) = ($data->inplace->transpose, $id_c, $id_r);
  if ($opt{V}) {
    print "OK.\ndata table as PDL dim o x v: " . $data->info . "\n";
    $idv and print "$_\t$$idv[$_]\n" for 0..$#$idv;
  }
  $data = $data->setvaltobad( $opt{MISSN} );
  $data->check_badflag;
  return wantarray? (@$idv? ($data, $idv, $ido) : ($data, $ido)) : $data;
}

=head2 group_by

Returns pdl reshaped according to the specified factor variable. Most useful when used in conjunction with other broadcasting calculations such as average, stdv, etc. When the factor variable contains unequal number of cases in each level, the returned pdl is padded with bad values to fit the level with the most number of cases. This allows the subsequent calculation (average, stdv, etc) to return the correct results for each level.
Usage:

    # simple case with 1d pdl and equal number of n in each level of the factor

    pdl> p $a = sequence 10
    [0 1 2 3 4 5 6 7 8 9]

    pdl> p $factor = $a > 4
    [0 0 0 0 0 1 1 1 1 1]

    pdl> p $a->group_by( $factor )->average
    [2 7]

    # more complex case with broadcasting and unequal number of n across levels in the factor

    pdl> p $a = sequence 10,2
    [
     [ 0  1  2  3  4  5  6  7  8  9]
     [10 11 12 13 14 15 16 17 18 19]
    ]

    pdl> p $factor = qsort $a( ,0) % 3
    [
     [0 0 0 0 1 1 1 2 2 2]
    ]

    pdl> p $a->group_by( $factor )
    [
     [
      [ 0  1  2  3]
      [10 11 12 13]
     ]
     [
      [  4   5   6 BAD]
      [ 14  15  16 BAD]
     ]
     [
      [  7   8   9 BAD]
      [ 17  18  19 BAD]
     ]
    ]
    ARRAY(0xa2a4e40)

    # group_by supports perl factors, multiple factors
    # returns factor labels in addition to pdl in array context

    pdl> p $a = sequence 12
    [0 1 2 3 4 5 6 7 8 9 10 11]
    pdl> $odd_even = [qw( e o e o e o e o e o e o )]
    pdl> $magnitude = [qw( l l l l l l h h h h h h )]
    pdl> ($a_grouped, $label) = $a->group_by( $odd_even, $magnitude )
    pdl> p $a_grouped
    [
     [
      [0 2 4]
      [1 3 5]
     ]
     [
      [ 6  8 10]
      [ 7  9 11]
     ]
    ]
    pdl> p Dumper $label
    $VAR1 = [
              [ 'e_l', 'o_l' ],
              [ 'e_h', 'o_h' ]
            ];

=cut

*group_by = \&PDL::group_by;

# Reshape $p according to one or more factor variables; unequal cells
# are padded with BAD.  In list context also returns the cell labels.
sub PDL::group_by {
  my $p = shift;
  my @factors = @_;
  if ( @factors == 1 ) {
    my $factor = $factors[0];
    my $label;
    if (ref $factor eq 'ARRAY') {
      $label = _ordered_uniq($factor);
      $factor = code_ivs($factor);
    } else {
      my $perl_factor = [$factor->list];
      $label = _ordered_uniq($perl_factor);
      # NOTE(review): labels here follow first-appearance order, while
      # _group_by_single_factor groups by sorted unique factor values;
      # for an unsorted pdl factor these orders can disagree -- confirm
      # against callers before relying on label order.
    }
    my $p_reshaped = _group_by_single_factor( $p, $factor );
    return wantarray? ($p_reshaped, $label) : $p_reshaped;
  }
  # make sure all are arrays instead of pdls
  @factors = map { ref($_) eq 'PDL'? [$_->list] : $_ } @factors;
  my (@cells);
  for my $ele (0 .. $#{$factors[0]}) {
    # one combined cell label per observation, eg "e_l"
    my $c = join '_', map { $_->[$ele] } @factors;
    push @cells, $c;
  }
  # get uniq cell labels (ref List::MoreUtils::uniq)
  my %seen;
  my @uniq_cells = grep { ! $seen{$_}++ } @cells;
  my $flat_factor = code_ivs( \@cells );
  my $p_reshaped = _group_by_single_factor( $p, $flat_factor );
  # get levels of each factor and reshape accordingly
  my @levels;
  for (@factors) {
    my %uniq;
    @uniq{ @$_ } = ();
    push @levels, scalar keys %uniq;
  }
  $p_reshaped = $p_reshaped->reshape( $p_reshaped->dim(0), @levels )->sever;
  # make labels for the returned data structure matching pdl structure
  my @labels;
  if (wantarray) {
    for my $ifactor (0 .. $#levels) {
      my @factor_label;
      for my $ilevel (0 .. $levels[$ifactor]-1) {
        my $i = $ifactor * $levels[$ifactor] + $ilevel;
        push @factor_label, $uniq_cells[$i];
      }
      push @labels, \@factor_label;
    }
  }
  return wantarray? ($p_reshaped, \@labels) : $p_reshaped;
}

# get uniq cell labels in first-appearance order (ref List::MoreUtils::uniq)
sub _ordered_uniq {
  my $arr = shift;
  my %seen;
  my @uniq = grep { ! $seen{$_}++ } @$arr;
  return \@uniq;
}

# Split $p's first dim by a single 1-d factor pdl; output gains a
# trailing group dim, and short groups are padded with BAD.
sub _group_by_single_factor {
  my $p = shift;
  my $factor = shift;
  $factor = $factor->squeeze;
  die "Currently support only 1d factor pdl." if $factor->ndims > 1;
  die "Data pdl and factor pdl do not match!"
    unless $factor->dim(0) == $p->dim(0);
  # get active dim that will be split according to factor and dims to broadcast over
  my @p_broadcastdims = $p->dims;
  my $p_dim0 = shift @p_broadcastdims;
  my $uniq = $factor->uniq;
  my @uniq_ns;
  for ($uniq->list) {
    push @uniq_ns, which( $factor == $_ )->nelem;
  }
  # get number of n's in each group, find the biggest, fit output pdl to this
  my $uniq_ns = pdl \@uniq_ns;
  # fix: reuse $uniq_ns instead of constructing pdl(\@uniq_ns) a second time
  my $max = $uniq_ns->max->sclr;
  # sentinel guaranteed to be outside the data range (> max of $p)
  my $badvalue = int($p->max + 1);
  my $p_tmp = ones($max, @p_broadcastdims, $uniq->nelem) * $badvalue;
  for (0 .. $#uniq_ns) {
    my $i = which $factor == $uniq->slice($_);
    $p_tmp->dice_axis(-1,$_)->squeeze->slice([0,$uniq_ns[$_]-1]) .= $p->slice($i);
  }
  $p_tmp->badflag(1);
  return $p_tmp->setvaltobad($badvalue);
}

=head2 which_id

=for ref

Lookup specified var (obs) ids in $idv ($ido) (see B<rtable>) and return indices in $idv ($ido) as pdl if found.
The indices are ordered by the specified subset.

Useful for selecting data by var (obs) id.

=for usage

    my $ind = which_id $ido, ['smith', 'summers', 'tesla'];
    my $data_subset = $data( $ind, );

    # take advantage of perl pattern matching
    # e.g. use data from people whose last name starts with s
    my $i = which_id $ido, [ grep { /^s/ } @$ido ];
    my $data_s = $data($i, );

=cut

# Map each requested id to its position in the full id list; ids that
# are not found are silently dropped.  Positions are returned as a pdl
# in the order of the requested subset.
sub which_id {
  my ($full_ids, $wanted) = @_;
  my %pos_of;
  $pos_of{ $full_ids->[$_] } = $_ for 0 .. $#$full_ids;
  return pdl( grep defined, map $pos_of{$_}, @$wanted );
}

# Values regarded as missing when coding a factor variable.
my %code_bad = ( '' => 1, 'BAD' => 1 );

# Code a factor (perl array ref or pdl) as consecutive integer levels
# assigned in first-appearance order; missing entries become BAD.  In
# list context also returns the value-to-level map.
sub code_ivs {
  my ($var_ref) = @_;
  $var_ref = [ $var_ref->list ] if UNIVERSAL::isa($var_ref, 'PDL');
  my @clean = map { !defined($_) || $code_bad{$_} ? undef : $_ } @$var_ref;
  my %level;
  my $next_level = 0;
  $level{$_} //= $next_level++ for grep defined, @clean;
  my $coded = pdl( map { defined($_) ? $level{$_} : -1 } @clean )->setvaltobad(-1);
  $coded->check_badflag;
  return wantarray ? ($coded, \%level) : $coded;
}

=head1 SEE ALSO

PDL::Basic (hist for frequency counts)

PDL::Ufunc (sum, avg, median, min, max, etc.)

PDL::GSL::CDF (various cumulative distribution functions)

=head1 REFERENCES

Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, TX: Harcourt Brace College Publishers.

=head1 AUTHOR

Copyright (C) 2009 Maggie J. Xiong

All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.

=cut

#line 1379 "lib/PDL/Stats/Basic.pm"

# Exit with OK status

1;

PDL-Stats-0.855/GENERATED/PDL/Stats/GLM.pm0000644000175000017500000021120414762214365017176 0ustar osboxesosboxes#
# GENERATED WITH PDL::PP from lib/PDL/Stats/GLM.pd! Don't modify!
# package PDL::Stats::GLM; our @EXPORT_OK = qw(ols_t ols ols_rptd anova anova_rptd anova_design_matrix dummy_code effect_code effect_code_w interaction_code r2_change logistic pca pca_sorti plot_means plot_residuals plot_screes fill_m fill_rand dev_m stddz sse mse rmse pred_logistic d0 dm dvrs ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::GLM ; #line 14 "lib/PDL/Stats/GLM.pd" use strict; use warnings; use Carp; use PDL::LiteF; use PDL::MatrixOps; use PDL::Stats::Basic; use PDL::Stats::Kmeans; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; =encoding utf8 =head1 NAME PDL::Stats::GLM -- general and generalized linear modelling methods such as ANOVA, linear regression, PCA, and logistic regression. =head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::GLM; # do a multiple linear regression and plot the residuals my $y = pdl( 8, 7, 7, 0, 2, 5, 0 ); my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component [ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component my %m = $y->ols( $x, {plot=>1} ); print "$_\t$m{$_}\n" for sort keys %m; =head1 DESCRIPTION For more about general linear modelling, see L. For an unbelievably thorough text on experimental design and analysis, including linear modelling, see L. The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are NOT broadcastable, respectively. FUNCTIONS support bad values. P-values, where appropriate, are provided if PDL::GSL::CDF is installed. =cut #line 75 "lib/PDL/Stats/GLM.pm" =head1 FUNCTIONS =cut =head2 fill_m =for sig Signature: (a(n); [o]b(n)) Types: (float double) =for ref Replaces bad values with sample mean. Mean is set to 0 if all obs are bad. =for usage pdl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] pdl> p $data->fill_m [ [ 5 3.5 2 3.5] [ 7 3 7 5.66667] ] =pod Broadcasts over its inputs. 
=for bad The output pdl badflag is cleared. =cut *fill_m = \&PDL::fill_m; =head2 fill_rand =for sig Signature: (a(n); [o]b(n)) Types: (sbyte byte short ushort long ulong indx ulonglong longlong float double ldouble) =for ref Replaces bad values with random sample (with replacement) of good observations from the same variable. =for usage pdl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] pdl> p $data->fill_rand [ [5 2 2 5] [7 3 7 7] ] =pod Broadcasts over its inputs. =for bad The output pdl badflag is cleared. =cut *fill_rand = \&PDL::fill_rand; =head2 dev_m =for sig Signature: (a(n); [o]b(n)) Types: (float double) =for usage $b = dev_m($a); dev_m($a, $b); # all arguments given $b = $a->dev_m; # method call $a->dev_m($b); $a->inplace->dev_m; # can be used inplace dev_m($a->inplace); =for ref Replaces values with deviations from the mean. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *dev_m = \&PDL::dev_m; =head2 stddz =for sig Signature: (a(n); [o]b(n)) Types: (float double) =for usage $b = stddz($a); stddz($a, $b); # all arguments given $b = $a->stddz; # method call $a->stddz($b); $a->inplace->stddz; # can be used inplace stddz($a->inplace); =for ref Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0). =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *stddz = \&PDL::stddz; =head2 sse =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = sse($a, $b); sse($a, $b, $c); # all arguments given $c = $a->sse($b); # method call $a->sse($b, $c); =for ref Sum of squared errors between actual and predicted values. =pod Broadcasts over its inputs. =for bad C processes bad values. 
It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *sse = \&PDL::sse; =head2 mse =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = mse($a, $b); mse($a, $b, $c); # all arguments given $c = $a->mse($b); # method call $a->mse($b, $c); =for ref Mean of squared errors between actual and predicted values, ie variance around predicted value. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mse = \&PDL::mse; =head2 rmse =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = rmse($a, $b); rmse($a, $b, $c); # all arguments given $c = $a->rmse($b); # method call $a->rmse($b, $c); =for ref Root mean squared error, ie stdv around predicted value. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *rmse = \&PDL::rmse; =head2 pred_logistic =for sig Signature: (a(n,m); b(m); [o]c(n)) Types: (float double) =for ref Calculates predicted prob value for logistic regression. =for usage # glue constant then apply coeff returned by the logistic method $pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} ); =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *pred_logistic = \&PDL::pred_logistic; =head2 d0 =for sig Signature: (a(n); [o]c()) Types: (float double) =for usage $c = d0($a); d0($a, $c); # all arguments given $c = $a->d0; # method call $a->d0($c); =for ref Null deviance for logistic regression. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *d0 = \&PDL::d0; =head2 dm =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for ref Model deviance for logistic regression. =for usage my $dm = $y->dm( $y_pred ); # null deviance my $d0 = $y->dm( ones($y->nelem) * $y->avg ); =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *dm = \&PDL::dm; =head2 dvrs =for sig Signature: (a(); b(); [o]c()) Types: (float double) =for usage $c = dvrs($a, $b); dvrs($a, $b, $c); # all arguments given $c = $a->dvrs($b); # method call $a->dvrs($b, $c); =for ref Deviance residual for logistic regression. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *dvrs = \&PDL::dvrs; #line 359 "lib/PDL/Stats/GLM.pd" #line 360 "lib/PDL/Stats/GLM.pd" =head2 ols_t =for ref Broadcasted version of ordinary least squares regression (B). The price of broadcasting was losing significance tests for coefficients (but see B). The fitting function was shamelessly copied then modified from PDL::Fit::Linfit. Intercept is FIRST of coeff if CONST => 1. ols_t does not handle bad values. consider B or B if there are bad values. 
=for options Default options (case insensitive): CONST => 1, =for usage Usage: # DV, 2 person's ratings for top-10 box office movies # ascending sorted by box office numbers pdl> p $y = pdl '1 1 2 4 4 4 4 5 5 5; 1 2 2 2 3 3 3 3 5 5' # model with 2 IVs, a linear and a quadratic trend component pdl> $x = cat sequence(10), sequence(10)**2 # suppose our novice modeler thinks this creates 3 different models # for predicting movie ratings pdl> p $x = cat $x, $x * 2, $x * 3 [ [ [ 0 1 2 3 4 5 6 7 8 9] [ 0 1 4 9 16 25 36 49 64 81] ] [ [ 0 2 4 6 8 10 12 14 16 18] [ 0 2 8 18 32 50 72 98 128 162] ] [ [ 0 3 6 9 12 15 18 21 24 27] [ 0 3 12 27 48 75 108 147 192 243] ] ] pdl> p $x->info PDL: Double D [10,2,3] # insert a dummy dim between IV and the dim (model) to be broadcasted pdl> %m = $y->ols_t( $x->dummy(2) ) pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m # 2 persons' ratings, each fitted with 3 "different" models F [ [ 38.314159 25.087209] [ 38.314159 25.087209] [ 38.314159 25.087209] ] # df is the same across dv and iv models F_df [2 7] F_p [ [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] ] R2 [ [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] ] b [ # constant linear quadratic [ [ 0.66363636 0.99015152 -0.056818182] # person 1 [ 1.4 0.18939394 0.022727273] # person 2 ] [ [ 0.66363636 0.49507576 -0.028409091] [ 1.4 0.09469697 0.011363636] ] [ [ 0.66363636 0.33005051 -0.018939394] [ 1.4 0.063131313 0.0075757576] ] ] # our novice modeler realizes at this point that # the 3 models only differ in the scaling of the IV coefficients ss_model [ [ 20.616667 13.075758] [ 20.616667 13.075758] [ 20.616667 13.075758] ] ss_residual [ [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] ] ss_total [22.5 14.9] y_pred [ [ [0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273] ... 
=cut *ols_t = \&PDL::ols_t; sub PDL::ols_t { _ols_common(1, @_); } sub _ols_common { my ($broadcasted, $y, $ivs, $opt) = @_; ($y, $ivs) = _ols_prep_inputs(@_); _ols_main($broadcasted, $y, $ivs, $opt); } sub _ols_prep_inputs { # y [n], ivs [n x attr] pdl my ($broadcasted, $y, $ivs, $opt) = @_; my %opt = ( CONST => 1, PLOT => 0, WIN => undef, # for plotting ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } if (!$broadcasted) { $y = $y->squeeze; $y->getndims > 1 and croak "use ols_t for broadcasted version"; } $ivs = $ivs->dummy(1) if $ivs->getndims == 1; ($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$broadcasted; # set up ivs and const as ivs $opt{CONST} and $ivs = ones($ivs->dim(0))->glue( 1, $ivs ); ($y, $ivs); } sub _ols_main { my ($broadcasted, $y, $ivs, $opt) = @_; my %opt = ( CONST => 1, PLOT => 0, WIN => undef, # for plotting ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $C = inv( $ivs x $ivs->t ); # Internally normalise data # (double) it or ushort y and sequence iv won't work right my $ymean = $y->abs->avgover->double; $ymean->where( $ymean==0 ) .= 1; my $divisor = $broadcasted ? $ymean->dummy(0) : $ymean; my $y2 = $y / $divisor; my $Y = $ivs x $y2->dummy(0); # Do the fit # Fitted coefficients vector my $coeff = PDL::squeeze( $C x $Y ); $coeff = $coeff->dummy(0) if $broadcasted and $coeff->getndims == 1 and $y->getndims > 1; $coeff *= $divisor; # Un-normalise # ***$coeff x $ivs looks nice but produces nan on successive tries*** my $y_pred = sumover( ($broadcasted ? $coeff->dummy(1) : $coeff) * $ivs->transpose ); $opt{PLOT} and $y->plot_residuals( $y_pred, \%opt ); return $coeff unless wantarray; my %ret = (y_pred => $y_pred); $ret{ss_total} = $opt{CONST} ? $y->ss : sumover( $y ** 2 ); $ret{ss_residual} = $y->sse( $ret{y_pred} ); $ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; $ret{R2} = $ret{ss_model} / $ret{ss_total}; my $n_var = $opt{CONST} ? 
$ivs->dim(1) - 1 : $ivs->dim(1); $ret{F_df} = pdl( $n_var, my $df1 = $y->dim(0) - $ivs->dim(1) ); $ret{F} = $ret{ss_model} / $n_var / ($ret{ss_residual} / $df1); $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $n_var, $df1 ) if $CDF; if (!$broadcasted) { my $se_b = ones( $coeff->dims? $coeff->dims : 1 ); $opt{CONST} and $se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 * $C->slice(0,0) ); # get the se for bs by successively regressing each iv by the rest ivs if ($ivs->dim(1) > 1) { my @coords = $opt{CONST} ? 1..$n_var : 0..$n_var-1; my $ones = !$opt{CONST} ? undef : ones($ivs->dim(0)); for my $k (@coords) { my $G = $ivs->dice_axis(1, [grep $_ != $k, @coords]); $G = $ones->glue( 1, $G ) if $opt{CONST}; my $b_G = $ivs->slice(':',$k)->ols( $G, {CONST=>0,PLOT=>0} ); my $ss_res_k = $ivs->slice(':',$k)->squeeze->sse( sumover($b_G * $G->transpose) ); $se_b->slice($k) .= sqrt( $ret{ss_residual} / $df1 / $ss_res_k ); } } else { $se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 / sum( $ivs->slice(':',0)**2 ) ); } $ret{b_se} = $se_b; $ret{b_t} = $coeff / $ret{b_se}; $ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $df1 ) ) if $CDF; } for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze }; $ret{b} = $coeff; return %ret; } =head2 r2_change =for ref Significance test for the incremental change in R2 when new variable(s) are added to an ols regression model. Returns the change stats as well as stats for both models. Based on L. (One way to make up for the lack of significance tests for coeffs in ols_t). 
=for options

Default options (case insensitive):

    CONST   => 1,

=for usage

Usage:

    # suppose these are two persons' ratings for top 10 box office movies
    # ascending sorted by box office

    pdl> p $y = qsort ceil(random(10, 2) * 5)
    [
     [1 1 2 2 2 3 4 4 4 4]
     [1 2 2 3 3 3 4 4 5 5]
    ]

    # first IV is a simple linear trend

    pdl> p $x1 = sequence 10
    [0 1 2 3 4 5 6 7 8 9]

    # the modeler wonders if adding a quadratic trend improves the fit

    pdl> p $x2 = sequence(10) ** 2
    [0 1 4 9 16 25 36 49 64 81]

    # two difference models are given in two pdls
    # each as would be pass on to ols_t
    # the 1st model includes only linear trend
    # the 2nd model includes linear and quadratic trends
    # when necessary use dummy dim so both models have the same ndims

    pdl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )

    pdl> p "$_\t$c{$_}\n" for sort keys %c
    #                person 1    person 2
    F_change        [0.72164948 0.071283096]
    # df same for both persons
    F_df    [1 7]
    F_p     [0.42370145 0.79717232]
    R2_change       [0.0085966043 0.00048562549]
    model0  HASH(0x8c10828)
    model1  HASH(0x8c135c8)

    # the answer here is no.

=cut

*r2_change = \&PDL::r2_change;

# F test for the R2 increment between a reduced model ($ivs0) and a
# full model ($ivs1); both models are fit with ols_t, so this is
# broadcastable.  Returns change stats plus both fitted models.
sub PDL::r2_change {
  my ($self, $ivs0, $ivs1, $opt) = @_;
  # promote a 1-D reduced-model IV pdl to n x 1 so ols_t sees one predictor
  $ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);
  my %ret;
  $ret{model0} = { $self->ols_t( $ivs0, $opt ) };
  $ret{model1} = { $self->ols_t( $ivs1, $opt ) };
  $ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
  # df0 = number of added predictors; df1 = full-model error df
  $ret{F_df} = pdl(my $df0 = $ivs1->dim(1) - $ivs0->dim(1),
                   my $df1 = $ret{model1}->{F_df}->slice('(1)') );
  $ret{F_change} = $ret{R2_change} * $df1 / ( (1-$ret{model1}->{R2}) * $df0 );
  # p-value only when PDL::GSL::CDF was loadable at module init ($CDF)
  $ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $df0, $df1 )
    if $CDF;
  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  %ret;
}

=head1 METHODS

=head2 anova

=for ref

Analysis of variance. Uses type III sum of squares for unbalanced data.

Dependent variable should be a 1D pdl. Independent variables can be passed as 1D perl array ref or 1D pdl.
Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. Supports bad value (by ignoring missing or BAD values in dependent and independent variables list-wise). For more on ANOVA, see L. =for options Default options (case insensitive): V => 1, # carps if bad value in variables IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] PLOT => 0, # plots highest order effect # can set plot_means options here WIN => undef, # for plotting =for usage Usage: # suppose this is ratings for 12 apples pdl> p $y = qsort ceil( random(12)*5 ) [1 1 2 2 2 3 3 4 4 4 5 5] # IV for types of apple pdl> p $a = sequence(12) % 3 + 1 [1 2 3 1 2 3 1 2 3 1 2 3] # IV for whether we baked the apple pdl> @b = qw( y y y y y y n n n n n n ) pdl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } ) pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m F 2.46666666666667 F_df [5 6] F_p 0.151168719948632 ms_model 3.08333333333333 ms_residual 1.25 ss_model 15.4166666666667 ss_residual 7.5 ss_total 22.9166666666667 | apple | F 0.466666666666667 | apple | F_p 0.648078345471096 | apple | df 2 | apple | m [2.75 3 3.5] | apple | ms 0.583333333333334 | apple | se [0.85391256 0.81649658 0.64549722] | apple | ss 1.16666666666667 | apple || err df 6 | apple ~ bake | F 0.0666666666666671 | apple ~ bake | F_p 0.936190104380701 | apple ~ bake | df 2 | apple ~ bake | m [ [1.5 2 2.5] [ 4 4 4.5] ] | apple ~ bake | ms 0.0833333333333339 | apple ~ bake | se [ [0.5 1 0.5] [ 1 1 0.5] ] | apple ~ bake | ss 0.166666666666668 | apple ~ bake || err df 6 | bake | F 11.2666666666667 | bake | F_p 0.015294126084452 | bake | df 1 | bake | m [2 4.1666667] | bake | ms 14.0833333333333 | bake | se [0.36514837 0.40138649] | bake | ss 14.0833333333333 | bake || err df 6 This is implemented as a call to L, with an C subjects vector. 
=cut *anova = \&PDL::anova; sub PDL::anova { my ($y, @args) = @_; anova_rptd($y, undef, @args); } sub _interactions { my ($ivs_ref, $idv) = @_; my (@inter, @idv_inter); for my $nway ( 2 .. @$ivs_ref ) { my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] ); while ( my @v = &$iter_idv() ) { push @inter, interaction_code(@$ivs_ref[@v]); push @idv_inter, join ' ~ ', @$idv[@v]; } } (\@inter, \@idv_inter); } # now prepare for cell mean sub _interactions_cm { my ($ivs_ref, $pdl_ivs_raw) = @_; my ($dim0, @inter_cm, @inter_cmo) = $ivs_ref->[0]->dim(0); for my $nway ( 2 .. @$ivs_ref ) { my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] ); while ( my @v = &$iter_idv() ) { my @i_cm; for my $o ( 0 .. $dim0 - 1 ) { push @i_cm, join '', map $_->slice("($o)"), @$pdl_ivs_raw[@v]; } my ($inter, $map) = effect_code( \@i_cm ); push @inter_cm, $inter; # get the order to put means in correct multi dim pdl pos # this is order in var_e dim(1) my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map; # this is order needed for cell mean my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) } 0 .. 
$#levels; push @inter_cmo, pdl @i_cmo; } } (\@inter_cmo, \@inter_cm); } sub _cell_means { my ($data, $ivs_cm_ref, $i_cmo_ref, $idv, $pdl_ivs_raw) = @_; my %ind_id; @ind_id{ @$idv } = 0..$#$idv; my %cm; my $i = 0; for (@$ivs_cm_ref) { confess "_cell_means passed empty ivs_cm_ref ndarray at pos $i" if $_->isempty; my $last = zeroes $_->dim(0); my $i_neg = which $_->slice(':',0) == -1; $last->slice($i_neg) .= 1; $_->where($_ == -1) .= 0; $_ = $_->glue(1, $last); my @v = split ' ~ ', $idv->[$i]; my @shape = map $pdl_ivs_raw->[$_]->uniq->nelem, @ind_id{@v}; my ($m, $ss) = $data->centroid( $_ ); $m = $m->slice($i_cmo_ref->[$i])->sever; $ss = $ss->slice($i_cmo_ref->[$i])->sever; $m = $m->reshape(@shape); my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape); $cm{ "| $idv->[$i] | m" } = $m; $cm{ "| $idv->[$i] | se" } = $se; $i++; } \%cm; } # http://www.perlmonks.org/?node_id=371228 sub _combinations { my ($num, $arr) = @_; return sub { return } if $num == 0 or $num > @$arr; my @pick; return sub { return @$arr[ @pick = ( 0 .. $num - 1 ) ] unless @pick; my $i = $#pick; $i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i; return if $i < 0; @pick[$i .. $#pick] = $pick[$i] .. $#$arr; return @$arr[@pick]; }; } =head2 anova_rptd =for ref Repeated measures and mixed model anova. Uses type III sum of squares. The standard error (se) for the means are based on the relevant mean squared error from the anova, ie it is pooled across levels of the effect. Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. Uses L, so supports bad values. For more on repeated measures ANOVA, see L, and for mixed models, see L. =for options Default options (case insensitive): V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... 
] BTWN => [], # indices of between-subject IVs (matches IVNM indices) PLOT => 0, # plots highest order effect # see plot_means() for more options WIN => undef, # for plotting =for usage Usage: Some fictional data: recall_w_beer_and_wings.txt Subject Beer Wings Recall Alex 1 1 8 Alex 1 2 9 Alex 1 3 12 Alex 2 1 7 Alex 2 2 9 Alex 2 3 12 Brian 1 1 12 Brian 1 2 13 Brian 1 3 14 Brian 2 1 9 Brian 2 2 8 Brian 2 3 14 ... # rtable allows text only in 1st row and col my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt'; my ($b, $w, $dv) = $data->dog; # subj and IVs can be 1d pdl or @ ref # subj must be the first argument my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} ); print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m ss_residual 19.0833333333333 ss_subject 24.8333333333333 ss_total 133.833333333333 | Beer | F 9.39130434782609 | Beer | F_p 0.0547977008378944 | Beer | df 1 | Beer | m [10.916667 8.9166667] | Beer | ms 24 | Beer | se [0.4614791 0.4614791] | Beer | ss 24 | Beer || err df 3 | Beer || err ms 2.55555555555556 | Beer || err ss 7.66666666666667 | Beer ~ Wings | F 0.510917030567687 | Beer ~ Wings | F_p 0.623881438624431 | Beer ~ Wings | df 2 | Beer ~ Wings | m [ [ 10 7] [ 10.5 9.25] [12.25 10.5] ] | Beer ~ Wings | ms 1.625 | Beer ~ Wings | se [ [0.89170561 0.89170561] [0.89170561 0.89170561] [0.89170561 0.89170561] ] | Beer ~ Wings | ss 3.25000000000001 | Beer ~ Wings || err df 6 | Beer ~ Wings || err ms 3.18055555555555 | Beer ~ Wings || err ss 19.0833333333333 | Wings | F 4.52851711026616 | Wings | F_p 0.0632754786153548 | Wings | df 2 | Wings | m [8.5 9.875 11.375] | Wings | ms 16.5416666666667 | Wings | se [0.67571978 0.67571978 0.67571978] | Wings | ss 33.0833333333333 | Wings || err df 6 | Wings || err ms 3.65277777777778 | Wings || err ss 21.9166666666667 For mixed model anova, ie when there are between-subject IVs involved, feed the IVs as above, but specify in BTWN which IVs are between-subject. 
For example, if we had added age as a between-subject IV in the above
example, we would do

    my %m = $dv->anova_rptd( $subj, $age, $b, $w,
        { ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] });

=cut

*anova_rptd = \&PDL::anova_rptd;
# Type III repeated-measures / mixed-model anova.  Delegates design-matrix
# construction (and bad-value removal) to anova_design_matrix, then carves
# the model into effect and error sums of squares.
sub PDL::anova_rptd {
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, $subj, @ivs_raw) = @_;
  confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
  croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?"
    if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem);
  for (@ivs_raw) { croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1 }
  my %opt = (
    V    => 1,  # carps if bad value in dv
    IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
    ( !defined($subj) ? () : (
    BTWN => [], # indices of between-subject IVs (matches IVNM indices)
    )),
    PLOT => 0,     # plots highest order effect
    WIN  => undef, # for plotting
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
    if !$opt{IVNM} or !@{ $opt{IVNM} };
  # $y comes back possibly shortened (listwise bad-data removal)
  (my ($dsgn, $idv), $y, my ($sj, $ivs_ref_filtrd, $pdl_ivs_raw, $ivs_ref, $err_ref)) =
    $y->anova_design_matrix($subj, @ivs_raw, \%opt);
  confess "anova_rptd: got singular matrix for X' x X"
    if any +($dsgn->t x $dsgn)->det == 0;
  my $b_full = _ols_main(1, $y, $dsgn->t, {CONST=>0});
  my %ret = (ss_total => $y->ss,
             ss_residual => $y->sse( sumover( $b_full * $dsgn ) ));
  if (defined $subj) {
    # @full: effect terms then error terms; an error entry is either a pdl
    # (unique error term), an index into an earlier entry (repeated term),
    # or undef (use ss_residual).
    my @full = (@$ivs_ref, @$err_ref);
    my $has_btwn = @{ $opt{BTWN} };
    my @is_btwn; $is_btwn[$_] = 1 for @{ $opt{BTWN} };
    my @within_inds = 0 .. $#ivs_raw;
    @within_inds = grep !$is_btwn[$_], @within_inds if $has_btwn;
    my $within_df = pdl(map $_->dim(1), @full[@within_inds])->prodover->sclr;
    EFFECT: for my $k (0 .. $#full) {
      my $e = ($k > $#$ivs_ref)? '| err' : '';
      my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k;
      my $i_pref = $k == $#full ? undef : "| $idv->[$i] |";
      if (!defined $full[$k]) { # ss_residual as error
        $ret{ "$i_pref$e ss" } = $ret{ss_residual};
        # highest ord inter for purely within design, (p-1)*(q-1)*(n-1)
        my $factor = (ref $full[-1] ? $full[-1] : $err_ref->[$full[-1]])->dim(1);
        my $df = $ret{ "$i_pref$e df" } = $factor * $within_df;
        die "${i_pref}residual df = 0" if $df <= 0;
        $ret{ "$i_pref$e ms" } = $ret{ "$i_pref$e ss" } / $df;
      }
      elsif (ref $full[$k]) { # unique error term
        # SS by model comparison: full model vs model without this term
        next EFFECT
          unless my @G = grep $_ != $k && defined $full[$_], 0 .. $#full;
        my $G = ones($y->dim(0))->glue(1, grep ref $_, @full[@G]);
        my $b_G = $y->ols_t( $G, {CONST=>0} );
        my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
          = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual};
        if ($k != $#full) {
          my $df = $ret{"$i_pref$e df"} = $full[$k]->dim(1);
          die "residual df = 0" if $df <= 0;
          $ret{"$i_pref$e ms"} = $ss / $df;
        }
      }
      else { # repeating error term
        my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
          = $ret{"| $idv->[$full[$k]] |$e ss"};
        if ($k != $#full) {
          my $df = $ret{"$i_pref$e df"} = $ret{"| $idv->[$full[$k]] |$e df"};
          die "residual df = 0" if $df <= 0;
          $ret{"$i_pref$e ms"} = $ss / $df;
        }
      }
    }
  }
  else {
    # between-subject anova: single residual error term
    $ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
    $ret{F_df} = pdl(my $F_df0 = $dsgn->dim(0) - 1,
                     my $df1 = $y->nelem - $dsgn->dim(0));
    $ret{ms_model} = $ret{ss_model} / $F_df0;
    $ret{ms_residual} = $ret{ss_residual} / $df1;
    $ret{F} = $ret{ms_model} / $ret{ms_residual};
    $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $F_df0, $df1 )
      if $CDF and $df1 > 0;
    # get IV ss from $ivs_ref instead of $dsgn pdl
    my $ones = ones($y->dim(0));
    for my $k (0 .. $#$ivs_ref) {
      my $G = $ones->glue(1, @$ivs_ref[grep $_ != $k, 0 .. $#$ivs_ref]);
      my $b_G = $y->ols_t( $G, {CONST=>0} );
      $ret{ "| $idv->[$k] | ss" }
        = $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual};
      my $df0 = $ret{ "| $idv->[$k] | df" } = $ivs_ref->[$k]->dim(1);
      $ret{ "| $idv->[$k] || err df" } = $df1;
      die "residual df = 0" if $df1 <= 0;
      $ret{ "| $idv->[$k] | ms" } = $ret{ "| $idv->[$k] | ss" } / $df0;
    }
  }
  # have all iv, inter, and error effects. get F and F_p
  for (0 .. $#$ivs_ref) {
    my $ms_residual = defined $subj
      ? $ret{ "| $idv->[$_] || err ms" } : $ret{ms_residual};
    my ($df0, $df1) = @ret{"| $idv->[$_] | df" , "| $idv->[$_] || err df"};
    my $F = $ret{ "| $idv->[$_] | F" } = $ret{ "| $idv->[$_] | ms" } / $ms_residual;
    $ret{ "| $idv->[$_] | F_p" } = 1 - $F->gsl_cdf_fdist_P($df0, $df1)
      if $CDF and $df1 > 0;
  }
  for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze};
  my ($inter_cmo_ref, $inter_cm_ref)
    = _interactions_cm($ivs_ref_filtrd, $pdl_ivs_raw);
  # append inter info to cell means effects
  my $ivs_cm_ref = [@$ivs_ref_filtrd, @$inter_cm_ref];
  my @i_cmo_ref = map pdl(values %{ (effect_code($_))[1] })->qsort, @$pdl_ivs_raw;
#line 1068 "lib/PDL/Stats/GLM.pd"
  push @i_cmo_ref, @$inter_cmo_ref;
  my $cm_ref = _cell_means( $y, $ivs_cm_ref, \@i_cmo_ref, $idv, $pdl_ivs_raw );
  if (defined $subj) {
    my @ls = map { $_->uniq->nelem } @$pdl_ivs_raw;
    $cm_ref = _fix_rptd_se( $cm_ref, \%ret, $opt{IVNM}, \@ls, $sj->uniq->nelem );
  }
  # integrate mean and se into %ret
  @ret{ keys %$cm_ref } = values %$cm_ref;
  my $highest = join(' ~ ', @{ $opt{IVNM} });
  $cm_ref->{"| $highest | m"}->plot_means( $cm_ref->{"| $highest | se"},
    { %opt, IVNM=>$idv } ) if $opt{PLOT};
  %ret;
}

=head2 anova_design_matrix

=for ref

Effect-coded design matrix for anova, including repeated-measures and
mixed-model. The C for use in linear regression i.e. C. Added in 0.854.
See L for more.

Supports bad value in the dependent and independent variables. It
automatically removes bad data listwise, i.e.
remove a subject's data if there is any cell missing for the subject.

=for options

Default options (case insensitive):

    V     => 1,   # carps if bad value in dv
    IVNM  => [],  # auto filled as ['IV_0', 'IV_1', ... ]
    BTWN  => [],  # indices of between-subject IVs (matches IVNM indices)

=for usage

    $matrix = $dv->anova_design_matrix(undef, $b, $w, {ivnm=>[qw(b w)]});
    $matrix = $dv->anova_design_matrix( $subj, $b, $w, {ivnm=>[qw(b w)]}); # repeated-measures
    $matrix = $dv->anova_design_matrix( $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # mixed-model
    ($matrix, $ivnm_ref) = $dv->anova_design_matrix( $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # list context also names

=cut

*anova_design_matrix = \&PDL::anova_design_matrix;
# Builds the effect-coded design matrix (constant, main effects,
# interactions, and — for repeated measures — subject/error columns).
# In list context also returns the bad-value-filtered $y, coded subject,
# and the intermediate coded-IV structures anova_rptd needs.
sub PDL::anova_design_matrix {
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, $subj, @ivs_raw) = @_;
  confess "No IVs: did you omit 'undef' for anova?" if !@ivs_raw;
  confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
  for (@ivs_raw) {
    croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1;
  }
  my %opt = (
    V    => 1,  # carps if bad value in dv
    IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
    ( !defined($subj) ? () : (
    BTWN => [], # indices of between-subject IVs (matches IVNM indices)
    )),
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
    if !$opt{IVNM} or !@{ $opt{IVNM} };
  my @idv_orig = @{ $opt{IVNM} };
  my @pdl_ivs_raw = map scalar PDL::Stats::Basic::code_ivs($_), @ivs_raw;
  my $pdl_ivs = pdl(\@pdl_ivs_raw);
  # explicit set badflag because pdl() removes badflag
  $pdl_ivs->badflag( scalar grep $_->badflag, @pdl_ivs_raw );
  my $sj;
  if (defined($subj)) {
    # delete bad data listwise ie remove subj if any cell missing
    $sj = PDL::Stats::Basic::code_ivs($subj);
    my $ibad = which( $y->isbad | nbadover($pdl_ivs->transpose) );
    my $sj_bad = $sj->slice($ibad)->uniq;
    if ($sj_bad->nelem) {
      warn $sj_bad->nelem . " subjects with missing data removed\n"
        if $opt{V};
      $sj = $sj->setvaltobad($_) for (list $sj_bad);
      my $igood = which $sj->isgood;
      for ($y, $sj, @pdl_ivs_raw) {
        $_ = $_->slice( $igood )->sever;
        $_->badflag(0);
      }
    }
  }
  else {
    # no subject var: plain row-wise (observation) deletion
    ($y, $pdl_ivs) = _rm_bad_value( $y, $pdl_ivs );
    if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) {
      warn sprintf "%d subjects with missing data removed\n",
        $pdl_ivs_raw[0]->nelem - $y->nelem;
    }
    @pdl_ivs_raw = $pdl_ivs->dog;
  }
  my @ivs_ref_fltrd = map scalar effect_code($_), @pdl_ivs_raw;
  my ($ivs_inter_ref, $idv_inter) = _interactions(\@ivs_ref_fltrd, \@idv_orig);
  # append inter info to main effects
  my $ivs_ref = [@ivs_ref_fltrd, @$ivs_inter_ref];
  my @idv = (@idv_orig, @$idv_inter);
  # matches $ivs_ref, with an extra last pdl for subj effect
  my $err_ref = !defined($subj) ? []
    : _add_errors( $sj, $ivs_ref, \@idv, \@pdl_ivs_raw, $opt{BTWN} );
  # sanity check: every unique error-term pdl must have no all-zero column
  for (grep ref $err_ref->[$_], 0..$#$err_ref) {
    my ($null_row_ids, $non_null_row_ids) = $err_ref->[$_]->zcover->which_both;
    confess "got null columns $null_row_ids in error entry #$_ ($idv[$_])"
      if !$null_row_ids->isempty;
  }
  my $dsgn = PDL::glue(1, ones($y->dim(0)), @$ivs_ref,
    (grep ref($_), @$err_ref))->t;
  !wantarray ? $dsgn
    : ($dsgn, \@idv, $y, $sj, \@ivs_ref_fltrd, \@pdl_ivs_raw, $ivs_ref, $err_ref);
}

# code (btwn group) subjects. Rutherford (2011) pp 208-209
# Effect-codes subjects separately within each between-subject group and
# block-diagonally assembles the columns into one pdl.
sub _code_btwn {
  my ($subj, $btwn) = @_;
  my (@grp, %grp_s);
  for my $n (0 .. $subj->nelem - 1) {
    # construct string to code group membership
    # something not treated as BAD by code_ivs to start off marking group membership
    # if no $btwn, everyone ends up in the same grp
    my $s = '_' . join '', map $_->slice($n), @$btwn;
    push @grp, $s;          # group membership
    $s .= $subj->slice($n); # keep track of total uniq subj
    $grp_s{$s} = 1;
  }
  my $grp = PDL::Stats::Kmeans::iv_cluster \@grp;
  my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1);
  my $d1 = 0;
  for my $g (0 .. $grp->dim(1)-1) {
    my $col_inds = which $grp->slice(':',$g);
    my $gsub = $subj->slice( $col_inds )->effect_code;
    my ($nobs, $nsub) = $gsub->dims;
    $spdl->slice($col_inds, [$d1,$d1+$nsub-1]) .= $gsub;
    $d1 += $nsub;
  }
  $spdl;
}

# Determine the error term for each effect in $ivs_ref.  Each returned
# element is a pdl (unique error), an index into an earlier effect
# (repeated error term), or undef (use ss_residual); a final element is
# appended for the subject effect itself.
sub _add_errors {
  my ($subj, $ivs_ref, $idv, $raw_ivs, $btwn) = @_;
  my $spdl = _code_btwn($subj, [@$raw_ivs[@$btwn]]);
  # if btwn factor involved, or highest order inter for within factors
  # elem is undef, so that
  # @errors ind matches @$ivs_ref, with an extra elem at the end for subj
  # mark btwn factors for error terms
  # same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p205)
  my %is_btwn = map +($_=>1), @$idv[ @$btwn ];
  my $has_btwn = keys %is_btwn;
  my %idv2indx = map +($idv->[$_]=>$_), 0..$#$idv;
  my $ie_subj;
  my @errors = map {
    my @fs = split ' ~ ', $idv->[$_];
    # separate bw and wn factors
    # if only bw, error is bw x subj
    # if only wn or wn and bw, error is wn x subj
    my @bw = !$has_btwn ? () : grep $is_btwn{$_}, @fs;
    my @wn = !$has_btwn ? @fs : grep !$is_btwn{$_}, @fs;
    $ie_subj = $_ if !defined($ie_subj) and !@wn;
    my $err = join ' ~ ', @wn ? @wn : @bw;
    # highest order inter of within factors, use ss_residual as error
    if ( @wn == @$raw_ivs - @$btwn ) { undef }
    # repeating btwn factors use ss_subject as error
    elsif (!@wn and $_ > $ie_subj) { $ie_subj }
    # repeating error term
    elsif ($_ > $idv2indx{$err}) { $idv2indx{$err} }
    elsif (@wn) { interaction_code($ivs_ref->[$_], $spdl) }
    else { $spdl }
  } 0 .. $#$ivs_ref;
  push @errors, $has_btwn ? $ie_subj : $spdl;
  \@errors;
}

# Recompute each effect's standard error from its anova error MS, pooled
# over the collapsed factor levels: se = sqrt(err_ms / n_obs_per_cell).
sub _fix_rptd_se {
  # if ivnm lvls_ref for within ss only this can work for mixed design
  my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_;
  my @se = map /^\| (.+?) \| se$/ ? $1 : (), keys %$cm_ref;
  my @n_obs = map {
    my @ivs = split / ~ /, $_;
    my $i_ivs = which_id $ivnm, \@ivs;
    my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs;
    my $collapsed = $icollapsed->nelem?
      pdl( @$lvls_ref[(list $icollapsed)] )->prodover : 1
    ;
    $n * $collapsed;
  } @se;
  for my $i (0 .. $#se) {
    $cm_ref->{"| $se[$i] | se"}
      .= sqrt( $ret->{"| $se[$i] || err ms"} / $n_obs[$i] );
  }
  $cm_ref;
}

=head2 dummy_code

=for ref

Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in
regression. Supports BAD value (missing or 'BAD' values result in the
corresponding pdl elements being marked as BAD).

=for usage

    pdl> @a = qw(a a a b b b c c c)
    pdl> p $a = dummy_code(\@a)
    [
     [1 1 1 0 0 0 0 0 0]
     [0 0 0 1 1 1 0 0 0]
    ]

=cut

*dummy_code = \&PDL::dummy_code;
# Dummy coding is effect coding with the reference level's -1s zeroed.
sub PDL::dummy_code {
  my ($var_ref) = @_;
  my $var_e = effect_code( $var_ref );
  $var_e->where( $var_e == -1 ) .= 0;
  $var_e;
}

=head2 effect_code

=for ref

Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for
use in regression. returns in @ context coded pdl and % ref to level -
pdl->dim(1) index. Supports BAD value (missing or 'BAD' values result in
the corresponding pdl elements being marked as BAD).

=for usage

    my @var = qw( a a a b b b c c c );
    my ($var_e, $map) = effect_code( \@var );
    print $var_e . $var_e->info . "\n";
    [
     [ 1  1  1  0  0  0 -1 -1 -1]
     [ 0  0  0  1  1  1 -1 -1 -1]
    ]
    PDL: Double D [9,2]
    print "$_\t$map->{$_}\n" for sort keys %$map
    a       0
    b       1
    c       2

=cut

*effect_code = \&PDL::effect_code;
# k levels -> k-1 indicator columns; the last (highest-coded) level gets
# -1 across all columns.
sub PDL::effect_code {
  my ($var_ref) = @_;
  my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
  my $var_max = $var->max;
  confess "effect_code called with only one unique value" if $var_max < 1;
  my $var_e = yvals( float, $var->nelem, $var_max ) == $var;
  $var_e->slice(which( $var == $var_max ), ) .= -1;
  $var_e = $var_e->setbadif( $var->isbad ) if $var->badflag;
  wantarray ? ($var_e, $map_ref) : $var_e;
}

=head2 effect_code_w

=for ref

Weighted effect code for nominal variable. returns in @ context coded pdl
and % ref to level - pdl->dim(1) index. Supports BAD value (missing or
'BAD' values result in the corresponding pdl elements being marked as BAD).
=for usage

    pdl> @a = qw( a a b b b c c )
    pdl> p $a = effect_code_w(\@a)
    [
     [   1    1    0    0    0   -1   -1]
     [   0    0    1    1    1 -1.5 -1.5]
    ]

=cut

*effect_code_w = \&PDL::effect_code_w;
# Like effect_code, but the reference level's -1s are scaled by the ratio
# of positive to negative cell counts so unbalanced designs stay centered.
sub PDL::effect_code_w {
  my ($var_ref) = @_;
  my ($var_e, $map_ref) = effect_code( $var_ref );
  # already balanced: columns sum to zero, nothing to reweight
  return wantarray ? ($var_e, $map_ref) : $var_e if $var_e->sum == 0;
  my $pos = $var_e == 1;
  my $neg = $var_e == -1;
  my $w = $pos->sumover / $neg->sumover;
  my $neg_ind = $neg->whichND;
  $var_e->indexND($neg_ind) *= $w->slice($neg_ind->slice('(1)'));
  wantarray ? ($var_e, $map_ref) : $var_e;
}

=head2 interaction_code

Returns the coded interaction term for effect-coded variables. Supports
BAD value (missing or 'BAD' values result in the corresponding pdl
elements being marked as BAD).

=for usage

    pdl> $a = sequence(6) > 2
    pdl> p $a = $a->effect_code
    [
     [ 1  1  1 -1 -1 -1]
    ]
    pdl> $b = pdl( qw( 0 1 2 0 1 2 ) )
    pdl> p $b = $b->effect_code
    [
     [ 1  0 -1  1  0 -1]
     [ 0  1 -1  0  1 -1]
    ]
    pdl> p $ab = interaction_code( $a, $b )
    [
     [ 1  0 -1 -1 -0  1]
     [ 0  1 -1 -0 -1  1]
    ]

=cut

*interaction_code = \&PDL::interaction_code;
# Outer product of the input pdls' columns, clumped back to 2-D: every
# pairwise (then n-wise) column product becomes one interaction column.
sub PDL::interaction_code {
  my $i = ones( $_[0]->dim(0), 1 );
  $i = ($i * $_->dummy(1))->clump(1,2) for @_;
  $i;
}

=head2 ols

=for ref

Ordinary least squares regression, aka linear regression. Unlike B, ols
is not broadcastable, but it can handle bad value (by ignoring
observations with bad value in dependent or independent variables
list-wise) and returns the full model in list context with various stats.

IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
the constant vector in $x. Intercept is automatically added and returned
as FIRST of the coeffs if CONST=>1. Returns full model in list context
and coeff in scalar context.

For more on multiple linear regression see L.

=for options

Default options (case insensitive):

    CONST => 1,
    PLOT  => 0,     # see plot_residuals() for plot options
    WIN   => undef, # for plotting

=for usage

Usage:

    # suppose this is a person's ratings for top 10 box office movies
    # ascending sorted by box office
    pdl> $y = pdl '[1 1 2 2 2 2 4 4 5 5]'

    # construct IV with linear and quadratic component
    pdl> p $x = cat sequence(10), sequence(10)**2
    [
     [ 0  1  2  3  4  5  6  7  8  9]
     [ 0  1  4  9 16 25 36 49 64 81]
    ]

    pdl> %m = $y->ols( $x )
    pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m
    F       40.4225352112676
    F_df    [2 7]
    F_p     0.000142834216344756
    R2      0.920314253647587

    # coeff  constant  linear  quadratic
    b       [0.981818 0.212121 0.030303]
    b_p     [0.039910 0.328001 0.203034]
    b_se    [0.389875 0.201746 0.021579]
    b_t     [2.518284 1.051422 1.404218]
    ss_model        19.8787878787879
    ss_residual     1.72121212121212
    ss_total        21.6
    y_pred  [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545]

=cut

*ols = \&PDL::ols;
# Thin wrapper: _ols_common's first arg 0 selects the non-broadcasting,
# bad-value-aware code path.
sub PDL::ols { _ols_common(0, @_); }

# ivs = [nobs x nivs] so can `dog` retval
# Listwise deletion: keep only rows where $y and every IV column are good.
# Returns ($y, $ivs, $idx) with badflags cleared; $idx undef if untouched.
sub _rm_bad_value {
  my ($y, $ivs) = @_;
  return ($y, $ivs, undef) if !$y->check_badflag and !$ivs->check_badflag;
  my $idx = which($y->isgood & (nbadover ($ivs->transpose)==0));
  $_ = $_->slice($idx)->sever for $y, $ivs;
  $_->badflag(0) for $y, $ivs;
  ($y, $ivs, $idx);
}

=head2 ols_rptd

=for ref

Repeated measures linear regression. Handles purely within-subject design
for now. (Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006). See
F for an example using the Lorch and Myers data.
=for usage

Usage:

    # This is the example from Lorch and Myers (1990),
    # a study on how characteristics of sentences affected reading time
    # Three within-subject IVs:
    # SP    -- serial position of sentence
    # WORDS -- number of words in sentence
    # NEW   -- number of new arguments in sentence

    # $subj can be 1D pdl or @ ref and must be the first argument
    # IV can be 1D @ ref or pdl
    # 1D @ ref is effect coded internally into pdl
    # pdl is left as is

    my %r = $rt->ols_rptd( $subj, $sp, $words, $new );

    print "$_\t$r{$_}\n" for sort keys %r;
    ss_residual     58.3754646504336
    ss_subject      51.8590337714286
    ss_total        405.188241771429
    #              SP        WORDS      NEW
    F       [ 7.208473  61.354153  1.0243311]
    F_p     [0.025006181 2.619081e-05 0.33792837]
    coeff   [0.33337285 0.45858933 0.15162986]
    df      [1 1 1]
    df_err  [9 9 9]
    ms      [ 18.450705  73.813294 0.57026483]
    ms_err  [ 2.5595857  1.2030692 0.55671923]
    ss      [ 18.450705  73.813294 0.57026483]
    ss_err  [ 23.036272  10.827623  5.0104731]

=cut

*ols_rptd = \&PDL::ols_rptd;
# Repeated-measures regression by successive model reduction (Lorch &
# Myers, 1990): subject effect first, then each predictor's SS, then each
# predictor-x-subject interaction as its error term.
sub PDL::ols_rptd {
  my ($y, $subj, @ivs_raw) = @_;
  $y = $y->squeeze;
  $y->getndims > 1 and croak "ols_rptd does not support broadcasting";
  # normalize every IV to a 2-D [nobs x ncol] pdl:
  # multi-col pdl kept as is; 1-D pdl gets a dummy column dim;
  # @ ref is effect coded
  my @ivs = map {
    (ref $_ eq 'PDL' and $_->ndims > 1)? $_
      : ref $_ eq 'PDL' ? $_->dummy(1)
      : scalar effect_code($_)
      ;
  } @ivs_raw;

  my %r;
  $r{ss_total} = $y->ss;

  # STEP 1: subj
  my $s = effect_code $subj;
  my $b_s = $y->ols_t($s);
  my $pred = sumover($b_s->slice('1:-1') * $s->transpose) + $b_s->slice(0);
  $r{ss_subject} = $r{ss_total} - $y->sse( $pred );

  # STEP 2: add predictor variables
  my $iv_p = $s->glue(1, @ivs);
  my $b_p = $y->ols_t($iv_p);
  # only care about coeff for predictor vars. no subj or const coeff
  $r{coeff} = $b_p->slice([-@ivs,-1])->sever;
  # get total sse for this step
  $pred = sumover($b_p->slice('1:-1') * $iv_p->transpose) + $b_p->slice(0);
  my $ss_pe = $y->sse( $pred );
  # get predictor ss by successively reducing the model
  $r{ss} = zeroes scalar(@ivs);
  for my $i (0 .. $#ivs) {
    my $iv = $s->glue(1, @ivs[ grep $_ != $i, 0..$#ivs ]);
    my $b = $y->ols_t($iv);
    $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
    $r{ss}->slice($i) .= $y->sse($pred) - $ss_pe;
  }

  # STEP 3: get predictor x subj interaction as error term
  my $iv_e = PDL::glue 1, map interaction_code( $s, $_ ), @ivs;
  # get total sse for this step. full model now.
  my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) );
  $pred = sumover($b_f->slice('1:-1') * $iv_p->glue(1,$iv_e)->transpose)
    + $b_f->slice(0);
  $r{ss_residual} = $y->sse( $pred );
  # get predictor x subj ss by successively reducing the error term
  $r{ss_err} = zeroes scalar(@ivs);
  for my $i (0 .. $#ivs) {
    my $iv = $iv_p->glue(1,
      map interaction_code($s, $_), @ivs[grep $_ != $i, 0..$#ivs]);
    my $b = $y->ols_t($iv);
    my $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
    $r{ss_err}->slice($i) .= $y->sse($pred) - $r{ss_residual};
  }

  # Finally, get MS, F, etc
  # df for a predictor = the number of columns it contributes to the model
  # (k-1 for an effect-coded k-level factor, 1 for a 1-D continuous IV).
  # BUGFIX: was `map $_->squeeze->ndims, @ivs`, which equals the column
  # count only for 1- or 2-column IVs -- an nobs x k pdl has ndims == 2 for
  # any k >= 2, so factors with 4+ levels had their df under-counted.
  # Every element of @ivs is 2-D by construction above, so dim(1) is the
  # column count in all cases.
  $r{df} = pdl( map $_->dim(1), @ivs );
  $r{ms} = $r{ss} / $r{df};
  $r{df_err} = $s->dim(1) * $r{df};
  $r{ms_err} = $r{ss_err} / $r{df_err};
  $r{F} = $r{ms} / $r{ms_err};
  $r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} ) if $CDF;

  %r;
}

=head2 logistic

=for ref

Logistic regression with maximum likelihood estimation using L.

IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
the constant vector in $x. It is included in the model and returned as
LAST of coeff. Returns full model in list context and coeff in scalar
context.

The significance tests are likelihood ratio tests (-2LL deviance) tests.
IV significance is tested by comparing deviances between the reduced
model (ie with the IV in question removed) and the full model.

***NOTE: the results here are qualitatively similar to but not identical
with results from R, because different algorithms are used for the
nonlinear parameter fit.
Use with discretion***

=for options

Default options (case insensitive):

    INITP => zeroes( $x->dim(1) + 1 ),    # n_iv + 1
    MAXIT => 1000,
    EPS   => 1e-7,

=for usage

Usage:

    # suppose this is whether a person had rented 10 movies
    pdl> p $y = ushort( random(10)*2 )
    [0 0 0 1 1 0 0 1 1 1]

    # IV 1 is box office ranking
    pdl> p $x1 = sequence(10)
    [0 1 2 3 4 5 6 7 8 9]

    # IV 2 is whether the movie is action- or chick-flick
    pdl> p $x2 = sequence(10) % 2
    [0 1 0 1 0 1 0 1 0 1]

    # concatenate the IVs together
    pdl> p $x = cat $x1, $x2
    [
     [0 1 2 3 4 5 6 7 8 9]
     [0 1 0 1 0 1 0 1 0 1]
    ]

    pdl> %m = $y->logistic( $x )
    pdl> p "$_\t$m{$_}\n" for sort keys %m
    D0      13.8629436111989
    Dm      9.8627829791575
    Dm_chisq        4.00016063204141
    Dm_df   2
    Dm_p    0.135324414081692
    #             ranking    genre      constant
    b       [0.41127706 0.53876358 -2.1201285]
    b_chisq [ 3.5974504 0.16835559  2.8577151]
    b_p     [0.057868258  0.6815774 0.090936587]
    iter    12
    y_pred  [0.10715577 0.23683909 ... 0.76316091 0.89284423]

    # to get the covariance out, supply a true value for the COV option:
    pdl> %m = $y->logistic( $x, {COV=>1} )
    pdl> p $m{cov};

=cut

*logistic = \&PDL::logistic;
# Fits p(y=1) = 1/(1+exp(-(b.x + c))) by Levenberg-Marquardt, then builds
# deviance-based (-2LL) likelihood-ratio tests for the model and each
# coefficient by refitting reduced models.
sub PDL::logistic {
  require PDL::Fit::LM;
  my ( $self, $ivs, $opt ) = @_;
  $self = $self->squeeze;
  # make compatible w multiple var cases
  $ivs->getndims == 1 and $ivs = $ivs->dummy(1);
  $self->dim(0) != $ivs->dim(0) and carp "mismatched n btwn DV and IV!";
  my %opt = (
    INITP => zeroes( $ivs->dim(1) + 1 ),    # n_ivs + 1
    MAXIT => 1000,
    EPS   => 1e-7,
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  # not using it atm
  $opt{WT} = 1;
  # Use lmfit. Fourth input argument is reference to user-defined
  # copy INITP so we have the original value when needed
  my ($yfit,$coeff,$cov,$iter)
    = PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic,
      $opt{INITP}->copy, { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
  # apparently at least coeff is child of some pdl
  # which is changed in later lmfit calls
  $yfit = $yfit->copy;
  $coeff = $coeff->copy;
  return $coeff unless wantarray;
  my %ret;
  my $n0 = $self->where($self == 0)->nelem;
  my $n1 = $self->nelem - $n0;
  $ret{cov} = $cov if $opt{COV};
  # D0: null (intercept-only) deviance from the observed 0/1 split
  $ret{D0} = -2*($n0 * log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
  $ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
  $ret{Dm_chisq} = $ret{D0} - $ret{Dm};
  $ret{Dm_df} = $ivs->dim(1);
  $ret{Dm_p}
    = 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
    if $CDF;
  my $coeff_chisq = zeroes $opt{INITP}->nelem;
  if ( $ivs->dim(1) > 1 ) {
    # per-IV test: deviance of model with that IV removed vs full model
    for my $k (0 .. $ivs->dim(1)-1) {
      my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
      my $G = $ivs->dice_axis(1, \@G);
      my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
      my $y_G = PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
        { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
      $coeff_chisq->slice($k) .= $self->dm( $y_G ) - $ret{Dm};
    }
  }
  else {
    # d0 is, by definition, the deviance with only intercept
    $coeff_chisq->slice(0) .= $ret{D0} - $ret{Dm};
  }
  # test the intercept by refitting without it
  my $y_c = PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT},
    \&_logistic_no_intercept, $opt{INITP}->slice('0:-2')->sever,
    { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
  $coeff_chisq->slice(-1) .= $self->dm( $y_c ) - $ret{Dm};
  $ret{b} = $coeff;
  $ret{b_chisq} = $coeff_chisq;
  $ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 ) if $CDF;
  $ret{y_pred} = $yfit;
  $ret{iter} = $iter;
  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  %ret;
}

# lmfit model function: logistic curve with intercept; fills $ym and the
# partial-derivative columns of $dyda in place.
sub _logistic {
  my ($x,$par,$ym,$dyda) = @_;
  # $b and $c are fit parameters slope and intercept
  my $b = $par->slice([0,$x->dim(1) - 1])->sever;
  my $c = $par->slice(-1)->sever;
  # Write function with dependent variable $ym,
  # independent variable $x, and fit parameters as specified above.
  # Use the .= (dot equals) assignment operator to express the equality
  # (not just a plain equals)
  $ym .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );
  my @dy = map $dyda->slice(",($_)"), 0 .. $par->dim(0)-1;
  # Partial derivative of the function with respect to each slope
  # fit parameter ($b in this case). Again, note .= assignment
  # operator (not just "equals")
  $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym) for (0 .. $b->dim(0)-1);
  # Partial derivative of the function re intercept par
  $dy[-1] .= $ym * (1 - $ym);
}

# lmfit model function: logistic curve with NO intercept term; used to
# test the significance of the constant via model comparison.
sub _logistic_no_intercept {
  my ($x,$par,$ym,$dyda) = @_;
  my $b = $par->slice([0,$x->dim(1) - 1])->sever;
  # Write function with dependent variable $ym,
  # independent variable $x, and fit parameters as specified above.
  # Use the .= (dot equals) assignment operator to express the equality
  # (not just a plain equals)
  $ym .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );
  my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);
  # Partial derivative of the function with respect to each slope
  # fit parameter ($b in this case). Again, note .= assignment
  # operator (not just "equals")
  $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym) for 0 .. $b->dim(0)-1;
}

=head2 pca

=for ref

Principal component analysis. Based on corr instead of cov. Bad values
are ignored pair-wise. (OK when bad values are few but otherwise probably
should fill_m etc before pca). Uses L.

=for options

Default options (case insensitive):

    CORR => 1,     # boolean. use correlation or covariance
    PLOT => 0,     # calls plot_screes by default
                   # can set plot_screes options here
    WIN  => undef, # for plotting

=for usage

Usage:

    my $d = qsort random 10, 5;      # 10 obs on 5 variables
    my %r = $d->pca( \%opt );
    print "$_\t$r{$_}\n" for (keys %r);

    eigenvalue    # variance accounted for by each component
    [4.70192 0.199604 0.0471421 0.0372981 0.0140346]

    eigenvector   # dim var x comp.
weights for mapping variables to component

    [
     [ -0.451251  -0.440696  -0.457628  -0.451491  -0.434618]
     [ -0.274551   0.582455   0.131494   0.255261  -0.709168]
     [   0.43282   0.500662  -0.139209  -0.735144 -0.0467834]
     [  0.693634  -0.428171   0.125114   0.128145  -0.550879]
     [  0.229202   0.180393  -0.859217     0.4173  0.0503155]
    ]

    loadings      # dim var x comp. correlation between variable and component
    [
     [  -0.978489   -0.955601   -0.992316    -0.97901   -0.942421]
     [  -0.122661    0.260224   0.0587476    0.114043   -0.316836]
     [  0.0939749    0.108705  -0.0302253   -0.159616  -0.0101577]
     [    0.13396  -0.0826915   0.0241629   0.0247483    -0.10639]
     [   0.027153   0.0213708   -0.101789   0.0494365  0.00596076]
    ]

    pct_var       # percent variance accounted for by each component
    [0.940384 0.0399209 0.00942842 0.00745963 0.00280691]

Plot scores along the first two components,

    $d->plot_scores( $r{eigenvector} );

=cut

*pca = \&PDL::pca;
# Eigendecomposition of the correlation (or covariance) table; components
# are returned sorted by descending eigenvalue.
sub PDL::pca {
  my ($self, $opt) = @_;
  my %opt = (
    CORR => 1,
    PLOT => 0,
    WIN  => undef, # for plotting
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;
  # value is axis pdl and score is var x axis
  my ($eigvec, $eigval) = $var_var->eigens_sym;
  $eigvec = $eigvec->transpose; # compatibility with PDL::Slatec::eigsys
  # ind is sticky point for broadcasting
  my $ind_sorted = $eigval->qsorti->slice('-1:0');
  $eigvec = $eigvec->slice(':',$ind_sorted)->sever;
  $eigval = $eigval->slice($ind_sorted)->sever;
  # var x axis
  my $var = $eigval / $eigval->sum->sclr;
  my $loadings;
  if ($opt{CORR}) {
    # with standardized vars, loading = eigvec scaled by sqrt(eigval)
    $loadings = $eigvec * sqrt( $eigval->transpose );
  } else {
    # covariance case: correlate raw scores with component scores
    my $scores = $eigvec x $self->dev_m;
    $loadings = $self->corr( $scores->dummy(1) );
  }
  $var->plot_screes(\%opt) if $opt{PLOT};
  ( eigenvalue=>$eigval, eigenvector=>$eigvec,
    pct_var=>$var, loadings=>$loadings );
}

=head2 pca_sorti

Determine by which vars a component is best represented. Descending sort
vars by size of association with that component. Returns sorted var and
relevant component indices.

=for options

Default options (case insensitive):

    NCOMP => 10,    # maximum number of components to consider

=for usage

Usage:

    # let's see if we replicated the Osgood et al. (1957) study
    pdl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}

    # select a subset of var to do pca
    pdl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
    pdl> $data = $data( ,$ind)->sever
    pdl> @$idv = @$idv[list $ind]

    pdl> %m = $data->pca
    pdl> ($iv, $ic) = $m{loadings}->pca_sorti()
    pdl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)
    #       COMP0     COMP1      COMP2        COMP3
    HAPPY   [0.860191 0.364911 0.174372 -0.10484]
    GOOD    [0.848694 0.303652 0.198378 -0.115177]
    CALM    [0.821177 -0.130542 0.396215 -0.125368]
    BRIGHT  [0.78303 0.232808 -0.0534081 -0.0528796]
    HEAVY   [-0.623036 0.454826 0.50447 0.073007]
    HARD    [-0.679179 0.0505568 0.384467 0.165608]
    ACTIVE  [-0.161098 0.760778 -0.44893 -0.0888592]
    FAST    [-0.196042 0.71479 -0.471355 0.00460276]
    LARGE   [-0.241994 0.594644 0.634703 -0.00618055]
    BASS    [-0.621213 -0.124918 0.0605367 -0.765184]

=cut

*pca_sorti = \&PDL::pca_sorti;
# Assign each var to the component it loads highest on (absolute value),
# then order vars by component and by descending loading within component.
sub PDL::pca_sorti {
  # $self is pdl (var x component)
  my ($self, $opt) = @_;
  my %opt = (
    NCOMP => 10,    # maximum number of components to consider
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
  $self = $self->dice_axis( 1, pdl(0..$ncomp-1) );
  my $icomp = $self->transpose->abs->maximum_ind;
  # sort between comp
  my $ivar_sort = $icomp->qsorti;
  $self = $self->slice($ivar_sort)->sever;
  # sort within comp
  my $ic = $icomp->slice($ivar_sort)->iv_cluster;
  for my $comp (0 .. $ic->dim(1)-1) {
    my $i = $self->slice(which($ic->slice(':',$comp)), "($comp)")
      ->qsorti->slice('-1:0');
    $ivar_sort->slice(which $ic->slice(':',$comp))
      .= $ivar_sort->slice(which $ic->slice(':',$comp))->slice($i);
  }
  wantarray ? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
}

=head2 plot_means

Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).
=for options

Default options (case insensitive):

    IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM => 'DV',
    AUTO => 1,     # auto set dims to be on x-axis, line, panel
                   # if set 0, dim 0 goes on x-axis, dim 1 as lines
                   # dim 2+ as panels
    # see PDL::Graphics::Simple for next option
    WIN  => undef, # pgswin object. not closed here if passed
                   # allows comparing multiple lines in same plot
    SIZE => 640,   # individual square panel size in pixels

=for usage

Usage:

    # see anova for mean / se pdl structure
    $mean->plot_means( $se, {IVNM=>['apple', 'bake']} );

Or like this:

    $m{'| apple ~ bake | m'}->plot_means;

=cut

*plot_means = \&PDL::plot_means;
# Plot up-to-4D mean pdl: one dim on the x axis, one as lines, the other
# two as panel columns/rows; optional matching $se pdl adds error bars.
sub PDL::plot_means {
  require PDL::Graphics::Simple;
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($self, $se) = @_;
  $self = $self->squeeze;
  if ($self->ndims > 4) {
    carp "Data is > 4D. No plot here.";
    return;
  }
  my %opt = (
    IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM => 'DV',
    AUTO => 1,     # auto set vars to be on X axis, line, panel
    WIN  => undef, # PDL::Graphics::Simple object
    SIZE => 640,   # individual square panel size in pixels
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
  # decide which vars to plot as x axis, lines, panels
  # put var w most levels on x axis
  # put var w least levels on diff panels
  my @iD = 0..3;
  my @dims = (1, 1, 1, 1);
  # splice ARRAY,OFFSET,LENGTH,LIST — pad data out to exactly 4 dims
  splice @dims, 0, $self->ndims, $self->dims;
  $self = $self->reshape(@dims)->sever;
  $se = $se->reshape(@dims)->sever if defined $se;
  @iD = reverse list qsorti pdl @dims if $opt{AUTO};
  # $iD[0] on x axis
  # $iD[1] as separate lines
  my $nx = $self->dim($iD[2]);    # n xpanels
  my $ny = $self->dim($iD[3]);    # n ypanels
  my $w = $opt{WIN}
    || PDL::Graphics::Simple::pgswin(
         size=>[$opt{SIZE}*$nx, $opt{SIZE}*$ny,'px']);
  my $seq0 = sequence(my $dim0 = $self->dim($iD[0]));
  my ($pcount, @plots) = 0;
  for my $y (0..$ny-1) {
    for my $x (0..$nx-1) {
      my $key_prefix = "$opt{IVNM}[$iD[0]]|";
      $key_prefix .= $opt{IVNM}[$iD[2]] . "=$x|" if $nx > 1;
      $key_prefix .= $opt{IVNM}[$iD[3]] . "=$y|" if $ny > 1;
      for (0 .. $self->dim($iD[1]) - 1) {
        my $ke = "$key_prefix$opt{IVNM}[$iD[1]]=$_";
        my $ydiced = $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
          ->dice_axis($iD[1],$_)->squeeze;
        # small per-line x offset so overlapping lines stay visible
        push @plots, with=>'lines', ke=>"$ke mean", style=>$pcount,
          $seq0+$pcount*0.05, $ydiced;
        push @plots, with=>'errorbars', ke=>"$ke error", style=>$pcount,
          $seq0+$pcount*0.05, $ydiced,
          $se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
            ->dice_axis($iD[1],$_)->squeeze
          if defined($se);
        $pcount++;
      }
    }
  }
  my ($ymin, $ymax)
    = pdl($self, !defined $se ? () : ($self+$se, $self-$se))->minmax;
  $w->plot(@plots,
    { xlabel=>$opt{IVNM}[$iD[0]], ylabel=>$opt{DVNM},
      xrange=>[-0.05,$dim0-1+$pcount*0.05],
      yrange=>[$ymin-0.05,$ymax+0.05] } );
  $w;
}

=head2 plot_residuals

Plots residuals against predicted values.

=for usage

Usage:

    use PDL::Graphics::Simple;
    $w = pgswin();
    $y->plot_residuals( $y_pred, { win=>$w } );

=for options

Default options (case insensitive):

    # see PDL::Graphics::Simple for more info
    WIN   => undef, # pgswin object. not closed here if passed
                    # allows comparing multiple lines in same plot
    SIZE  => 640,   # plot size in pixels
    COLOR => 1,

=cut

*plot_residuals = \&PDL::plot_residuals;
# Scatter of (predicted, residual) points plus a zero reference line.
sub PDL::plot_residuals {
  require PDL::Graphics::Simple;
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, $y_pred) = @_;
  my %opt = (
    # see PDL::Graphics::Simple for next options
    WIN   => undef, # pgswin object. not closed here if passed
                    # allows comparing multiple lines in same plot
    SIZE  => 640,   # plot size in pixels
    COLOR => 1,
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $residuals = $y - $y_pred;
  my $win = $opt{WIN}
    || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
  $win->plot(
    with=>'points', style=>$opt{COLOR}, $y_pred, $residuals,
    with=>'lines', style=>$opt{COLOR}, pdl($y_pred->minmax), pdl(0,0), # 0-line
    {xlabel=>'predicted value', ylabel=>'residuals'},
  );
}

=head2 plot_scores

Plots standardized original and PCA transformed scores against two
components.
(Thank you, Bob MacCallum, for the documentation suggestion that led to this function.) =for options Default options (case insensitive): CORR => 1, # boolean. PCA was based on correlation or covariance COMP => [0,1], # indices to components to plot # see PDL::Graphics::Simple for next options WIN => undef, # pgswin object. not closed here if passed # allows comparing multiple lines in same plot SIZE => 640, # plot size in pixels COLOR => [1,2], # color for original and rotated scores =for usage Usage: my %p = $data->pca(); $data->plot_scores( $p{eigenvector}, \%opt ); =cut *plot_scores = \&PDL::plot_scores; sub PDL::plot_scores { require PDL::Graphics::Simple; my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef; my ($self, $eigvec) = @_; my %opt = ( CORR => 1, # boolean. PCA was based on correlation or covariance COMP => [0,1], # indices to components to plot # see PDL::Graphics::Simple for next options WIN => undef, # pgswin object. not closed here if passed # allows comparing multiple lines in same plot SIZE => 640, # plot size in pixels COLOR => [1,2], # color for original and transformed scoress ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt } my $i = pdl $opt{COMP}; my $z = $opt{CORR} ? $self->stddz : $self->dev_m; # transformed normed values my $scores = sumover($eigvec->slice(':',$i) * $z->transpose->dummy(1))->transpose; $z = $z->slice(':',$i)->sever; my $win = $opt{WIN} || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']); $win->plot( with=>'points', style=>$opt{COLOR}[0], ke=>'original', $z->slice(',(0)'), $z->slice(',(1)'), with=>'points', style=>$opt{COLOR}[1], ke=>'transformed', $scores->slice(',(0)'), $scores->slice(',(1)'), {xlabel=>"Component $opt{COMP}[0]", ylabel=>"Component $opt{COMP}[1]"}, ); } =head2 plot_screes Scree plot. Plots proportion of variance accounted for by PCA components. 
=for options Default options (case insensitive): NCOMP => 20, # max number of components to plot CUT => 0, # set to plot cutoff line after this many components # undef to plot suggested cutoff line for NCOMP comps # see PDL::Graphics::Simple for next options WIN => undef, # pgswin object. not closed here if passed # allows comparing multiple lines in same plot SIZE => 640, # plot size in pixels =for usage Usage: # variance should be in descending order $d = qsort random 10, 5; # 10 obs on 5 variables %pca = $d->pca( \%opt ); $pca{pct_var}->plot_screes( {ncomp=>16, win=>$win=PDL::Graphics::Simple::pgswin()} ); Or, because NCOMP is used so often, it is allowed a shortcut, $pca{pct_var}->plot_screes( 16 ); =cut *plot_scree = \&PDL::plot_screes; # here for now for compatibility *plot_screes = \&PDL::plot_screes; sub PDL::plot_screes { require PDL::Graphics::Simple; my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef; my ($self, $ncomp) = @_; my %opt = ( NCOMP => 20, # max number of components to plot CUT => 0, # set to plot cutoff line after this many components # undef to plot suggested cutoff line for NCOMP comps # see PDL::Graphics::Simple for next options WIN => undef, # pgswin object. not closed here if passed # allows comparing multiple lines in same plot SIZE => 640, # plot size in pixels ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } $opt{NCOMP} = $ncomp if $ncomp; # re-use $ncomp below $ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP}; my $self_comp = $self->slice([0,$ncomp-1]); $opt{CUT} = PDL::Stats::Kmeans::_scree_ind $self_comp if !defined $opt{CUT}; my $win = $opt{WIN} || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']); $win->plot( with=>'lines', ke=>'scree', sequence($ncomp), $self_comp, !$opt{CUT} ? 
() : (with=>'lines', ke=>'cut', pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05)), {xlabel=>'Component', ylabel=>'Proportion of Variance Accounted for', xrange=>[-0.05,$ncomp-0.95], yrange=>[0,1], le=>'tr'}, ); } =head2 plot_stripchart Stripchart plot. Plots ANOVA-style data, categorised against given IVs. =for options Default options (case insensitive): IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] DVNM => 'DV', # see PDL::Graphics::Simple for next options WIN => undef, # pgswin object. not closed here if passed =for usage Usage: %m = $y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)] } ); =cut my $CHART_GAP = 0.1; *plot_stripchart = \&PDL::plot_stripchart; sub PDL::plot_stripchart { require PDL::Graphics::Simple; my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef; my ($y, @ivs_raw) = @_; my %opt = ( IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] DVNM => 'DV', WIN => undef, # pgswin object. not closed here if passed ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } $opt{IVNM} = [ map { "IV_$_" } 0 .. 
$#ivs_raw ] if !$opt{IVNM} or !@{ $opt{IVNM} }; my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(); my @codes = map [code_ivs($_)], @ivs_raw; my @levels = map { my $map = $_->[1]; [sort {$map->{$a} <=> $map->{$b}} keys %$map]; } @codes; my $xjitter = $y->random * $CHART_GAP; my ($pcount, @plots) = 0; push @plots, with=>'points', ke=>"all data", $xjitter+$pcount, $y; $pcount++; for my $i (0..$#ivs_raw) { my $levs = $levels[$i]; my $name = $opt{IVNM}[$i]; my $coded = $codes[$i][0]; for my $j (0..$#$levs) { my $inds = which($coded == $j); push @plots, with=>'points', ke=>"$name=$levs->[$j]", $xjitter->slice($inds)+$pcount+$j*$CHART_GAP, $y->slice($inds); } $pcount++; } my ($ymin, $ymax) = $y->minmax; my $xmax = $pcount-1 + $CHART_GAP*($#{$levels[-1]} + 2); $w->plot(@plots, { xlabel=>'IV', ylabel=>$opt{DVNM}, xrange=>[-1,$xmax], yrange=>[$ymin-$CHART_GAP,$ymax+$CHART_GAP] } ); $w; } =head1 SEE ALSO L L =head1 REFERENCES Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied Multiple Regression/correlation Analysis for the Behavioral Sciences (3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers. Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd ed.). New York, NY: Wiley-Interscience. Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated measures data in cognitive research. Journal of Experimental Psychology: Learning, Memory, & Cognition, 16, 149-157. Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of Meaning. Champaign, IL: University of Illinois Press. Rutherford, A. (2011). ANOVA and ANCOVA: A GLM Approach (2nd ed.). Wiley. Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved April 10, 2011 from http://citeseerx.ist.psu.edu/ The GLM procedure: unbalanced ANOVA for two-way design with interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18, 2009 from http://support.sas.com/ Van den Noortgate, W., & Onghena, P. (2006). 
Analysing repeated measures data in cognitive research: A comment on regression coefficient analyses. European Journal of Cognitive Psychology, 18, 937-952. =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut #line 2491 "lib/PDL/Stats/GLM.pm" # Exit with OK status 1; PDL-Stats-0.855/GENERATED/PDL/Stats/TS.pm0000644000175000017500000003603314762214365017112 0ustar osboxesosboxes# # GENERATED WITH PDL::PP from lib/PDL/Stats/TS.pd! Don't modify! # package PDL::Stats::TS; our @EXPORT_OK = qw(acf acvf dseason fill_ma filter_exp filter_ma mae mape wmape portmanteau pred_ar ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::TS ; #line 6 "lib/PDL/Stats/TS.pd" =encoding utf8 =head1 NAME PDL::Stats::TS -- basic time series functions =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. Plots require L. ***EXPERIMENTAL!*** In particular, bad value support is spotty and may be shaky. USE WITH DISCRETION! =head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::TS; my $r = $data->acf(5); =cut use strict; use warnings; use Carp; use PDL::LiteF; use PDL::Stats::Basic; use PDL::Stats::Kmeans; #line 58 "lib/PDL/Stats/TS.pm" =head1 FUNCTIONS =cut =head2 acf =for sig Signature: (x(t); [o]r(h); IV lag=>h) Types: (float double) =for usage $r = acf($x, $lag); acf($x, $r, $lag); # all arguments given $r = $x->acf($lag); # method call $x->acf($r, $lag); =for ref Autocorrelation function for up to lag h. If h is not specified it's set to t-1 by default. acf does not process bad values. =for example usage: pdl> $a = sequence 10 # lags 0 .. 
5 pdl> p $a->acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =pod Broadcasts over its inputs. =for bad C does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut #line 79 "lib/PDL/Stats/TS.pd" sub PDL::acf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; PDL::_acf_int($self, my $r = PDL->null, $h+1); $r; } #line 123 "lib/PDL/Stats/TS.pm" *acf = \&PDL::acf; =head2 acvf =for sig Signature: (x(t); [o]v(h); IV lag=>h) Types: (float double) =for usage $v = acvf($x, $lag); acvf($x, $v, $lag); # all arguments given $v = $x->acvf($lag); # method call $x->acvf($v, $lag); =for ref Autocovariance function for up to lag h. If h is not specified it's set to t-1 by default. acvf does not process bad values. =for example usage: pdl> $a = sequence 10 # lags 0 .. 5 pdl> p $a->acvf(5) [82.5 57.75 34 12.25 -6.5 -21.25] # autocorrelation pdl> p $a->acvf(5) / $a->acvf(0) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =pod Broadcasts over its inputs. =for bad C does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut #line 134 "lib/PDL/Stats/TS.pd" sub PDL::acvf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; PDL::_acvf_int($self, my $v = PDL->null, $h+1); $v; } #line 190 "lib/PDL/Stats/TS.pm" *acvf = \&PDL::acvf; =head2 dseason =for sig Signature: (x(t); indx d(); [o]xd(t)) Types: (float double) =for usage $xd = dseason($x, $d); dseason($x, $d, $xd); # all arguments given $xd = $x->dseason($d); # method call $x->dseason($d, $xd); =for ref Deseasonalize data using moving average filter the size of period d. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *dseason = \&PDL::dseason; =head2 fill_ma =for sig Signature: (x(t); indx q(); [o]xf(t)) Types: (float double) =for usage $xf = fill_ma($x, $q); fill_ma($x, $q, $xf); # all arguments given $xf = $x->fill_ma($q); # method call $x->fill_ma($q, $xf); =for ref Fill missing value with moving average. xf(t) = sum(x(t-q .. t-1, t+1 .. t+q)) / 2q. =for bad fill_ma does handle bad values. Output pdl bad flag is cleared unless the specified window size q is too small and there are still bad values. =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut #line 251 "lib/PDL/Stats/TS.pd" sub PDL::fill_ma { my ($x, $q) = @_; PDL::_fill_ma_int($x, $q, my $x_filled = PDL->null); $x_filled->check_badflag; # carp "ma window too small, still has bad value" # if $x_filled->badflag; return $x_filled; } #line 284 "lib/PDL/Stats/TS.pm" *fill_ma = \&PDL::fill_ma; =head2 filter_exp =for sig Signature: (x(t); a(); [o]xf(t)) Types: (float double) =for usage $xf = filter_exp($x, $a); filter_exp($x, $a, $xf); # all arguments given $xf = $x->filter_exp($a); # method call $x->filter_exp($a, $xf); =for ref Filter, exponential smoothing. xf(t) = a * x(t) + (1-a) * xf(t-1) =pod Broadcasts over its inputs. =for bad C does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *filter_exp = \&PDL::filter_exp; =head2 filter_ma =for sig Signature: (x(t); indx q(); [o]xf(t)) Types: (float double) =for usage $xf = filter_ma($x, $q); filter_ma($x, $q, $xf); # all arguments given $xf = $x->filter_ma($q); # method call $x->filter_ma($q, $xf); =for ref Filter, moving average. xf(t) = sum(x(t-q .. t+q)) / (2q + 1) =pod Broadcasts over its inputs. =for bad C does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *filter_ma = \&PDL::filter_ma; =head2 mae =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = mae($a, $b); mae($a, $b, $c); # all arguments given $c = $a->mae($b); # method call $a->mae($b, $c); =for ref Mean absolute error. MAE = 1/n * sum( abs(y - y_pred) ) =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mae = \&PDL::mae; =head2 mape =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = mape($a, $b); mape($a, $b, $c); # all arguments given $c = $a->mape($b); # method call $a->mape($b, $c); =for ref Mean absolute percent error. MAPE = 1/n * sum(abs((y - y_pred) / y)) =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *mape = \&PDL::mape; =head2 wmape =for sig Signature: (a(n); b(n); [o]c()) Types: (float double) =for usage $c = wmape($a, $b); wmape($a, $b, $c); # all arguments given $c = $a->wmape($b); # method call $a->wmape($b, $c); =for ref Weighted mean absolute percent error. avg(abs(error)) / avg(abs(data)). Much more robust compared to mape with division by zero error (cf. Schütz, W., & Kolassa, 2006). =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *wmape = \&PDL::wmape; =head2 portmanteau =for sig Signature: (r(h); longlong t(); [o]Q()) Types: (float double) =for usage $Q = portmanteau($r, $t); portmanteau($r, $t, $Q); # all arguments given $Q = $r->portmanteau($t); # method call $r->portmanteau($t, $Q); =for ref Portmanteau significance test (Ljung-Box) for autocorrelations. 
=for example Usage: pdl> $a = sequence 10 # acf for lags 0-5 # lag 0 excluded from portmanteau pdl> p $chisq = $a->acf(5)->portmanteau( $a->nelem ) 11.1753902662994 # get p-value from chisq distr pdl> use PDL::GSL::CDF pdl> p 1 - gsl_cdf_chisq_P( $chisq, 5 ) 0.0480112934306748 =pod Broadcasts over its inputs. =for bad C does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *portmanteau = \&PDL::portmanteau; =head2 pred_ar =for sig Signature: (x(p); b(p); [o]pred(t); IV end=>t) Types: (float double) =for usage $pred = pred_ar($x, $b, $end); pred_ar($x, $b, $pred, $end); # all arguments given $pred = $x->pred_ar($b, $end); # method call $x->pred_ar($b, $pred, $end); =for ref Calculates predicted values up to period t (extend current series up to period t) for autoregressive series, with or without constant. If there is constant, it is the last element in b, as would be returned by ols or ols_t. pred_ar does not process bad values. =for options CONST => 1, =for example Usage: pdl> $x = sequence 2 # last element is constant pdl> $b = pdl(.8, -.2, .3) pdl> p $x->pred_ar($b, 7) [0 1 1.1 0.74 0.492 0.3656 0.31408] # no constant pdl> p $x->pred_ar($b(0:1), 7, {const=>0}) [0 1 0.8 0.44 0.192 0.0656 0.01408] =pod Broadcasts over its inputs. =for bad C does not process bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut #line 425 "lib/PDL/Stats/TS.pd" sub PDL::pred_ar { my ($x, $b, $t, $opt) = @_; my %opt = ( CONST => 1 ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } $b = PDL->topdl($b); # allows passing simple number my $ext; if ($opt{CONST}) { my $t_ = $t - ( $x->dim(0) - $b->dim(0) + 1 ); PDL::_pred_ar_int($x->slice([-$b->dim(0)+1,-1]), $b->slice('0:-2'), $ext = PDL->null, $t_); $ext->slice([$b->dim(0)-1,-1]) += $b->slice(-1); return $x->append( $ext->slice([$b->dim(0)-1,-1]) ); } else { my $t_ = $t - ( $x->dim(0) - $b->dim(0) ); PDL::_pred_ar_int($x->slice([-$b->dim(0),-1]), $b, $ext = PDL->null, $t_); return $x->append($ext->slice([$b->dim(0),-1])); } } #line 619 "lib/PDL/Stats/TS.pm" *pred_ar = \&PDL::pred_ar; #line 472 "lib/PDL/Stats/TS.pd" #line 473 "lib/PDL/Stats/TS.pd" =head2 season_m Given length of season, returns seasonal mean and variance for each period (returns seasonal mean only in scalar context). =for options Default options (case insensitive): START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, # boolean # see PDL::Graphics::Simple for next options WIN => undef, # pass pgswin object for more plotting control COLOR => 1, =for usage my ($m, $ms) = $data->season_m( 24, { START_POSITION=>2 } ); =cut *season_m = \&PDL::season_m; sub PDL::season_m { my ($self, $d, $opt) = @_; my %opt = ( START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, WIN => undef, # pass pgswin object for more plotting control COLOR => 1, ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $n_season = ($self->dim(0) + $opt{START_POSITION}) / $d; $n_season = pdl($n_season)->ceil->sum->sclr; my @dims = $self->dims; $dims[0] = $n_season * $d; my $data = zeroes( @dims ) + $opt{MISSING}; $data->slice([$opt{START_POSITION},$opt{START_POSITION} + $self->dim(0)-1]) .= $self; $data->badflag(1); 
$data->inplace->setvaltobad( $opt{MISSING} ); my $s = sequence $d; $s = $s->dummy(1, $n_season)->flat; $s = $s->iv_cluster(); my ($m, $ms) = $data->centroid( $s ); if ($opt{PLOT}) { require PDL::Graphics::Simple; my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(); my $seq = sequence($d); my $errb_length = sqrt( $ms / $s->sumover )->squeeze; my $col = $opt{COLOR}; my @plots = map +(with=>'lines', ke=>"Data $col", style=>$col++, $seq, $_), $m->dog; push @plots, with=>'errorbars', ke=>'Error', style=>$opt{COLOR}, $seq, $m->squeeze, $errb_length if $m->squeeze->ndims < 2 && ($errb_length > 0)->any; $w->plot(@plots, { xlabel=>'period', ylabel=>'mean' }); } return wantarray? ($m, $ms) : $m; } =head2 plot_dseason =for ref Plots deseasonalized data and original data points. Opens and closes default window for plotting unless a C object is passed in options. Returns deseasonalized data. =for options Default options (case insensitive): WIN => undef, COLOR => 1, # data point color =cut *plot_dseason = \&PDL::plot_dseason; sub PDL::plot_dseason { require PDL::Graphics::Simple; my ($self, $d, $opt) = @_; !defined($d) and croak "please set season period length"; $self = $self->squeeze; my %opt = ( WIN => undef, COLOR => 1, # data point color ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $dsea = $self->dseason($d); my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(); my $seq = sequence($self->dim(0)); my $col = $opt{COLOR}; my @plots = map +(with=>'lines', ke=>"Data $col", style=>$col++, $seq, $_), $dsea->dog; $col = $opt{COLOR}; push @plots, map +(with=>'points', ke=>"De-seasonalised $col", style=>$col++, $seq, $_), $self->dog; $w->plot(@plots, { xlabel=>'T', ylabel=>'DV' }); return $dsea; } =head1 METHODS =head2 plot_acf =for ref Plots and returns autocorrelations for a time series. 
=for options Default options (case insensitive): SIG => 0.05, # can specify .10, .05, .01, or .001 WIN => undef, =for usage Usage: pdl> $a = sequence 10 pdl> p $r = $a->plot_acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *plot_acf = \&PDL::plot_acf; sub PDL::plot_acf { require PDL::Graphics::Simple; my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef; my ($self, $h) = @_; my $r = $self->acf($h); my %opt = ( SIG => 0.05, WIN => undef, ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $y_sig = ($opt{SIG} == 0.10)? 1.64485362695147 : ($opt{SIG} == 0.05)? 1.95996398454005 : ($opt{SIG} == 0.01)? 2.5758293035489 : ($opt{SIG} == 0.001)? 3.29052673149193 : 0 ; unless ($y_sig) { carp "SIG outside of recognized value. default to 0.05"; $y_sig = 1.95996398454005; } my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(); my $seq = pdl(-1,$h+1); my $y_seq = ones(2) * $y_sig / sqrt($self->dim(0)) * -1; $w->plot( with=>'lines', $seq, zeroes(2), # x axis with=>'lines', style=>2, $seq, $y_seq, with=>'lines', style=>2, $seq, -$y_seq, (map +(with=>'lines', ones(2)*$_, pdl(0, $r->slice("($_)"))), 0..$h), { xlabel=>'lag', ylabel=>'acf', } ); $r; } =head1 REFERENCES Brockwell, P.J., & Davis, R.A. (2002). Introduction to Time Series and Forecasting (2nd ed.). New York, NY: Springer. Schütz, W., & Kolassa, S. (2006). Foresight: advantages of the MAD/Mean ratio over the MAPE. Retrieved Jan 28, 2010, from http://www.saf-ag.com/226+M5965d28cd19.html =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut #line 808 "lib/PDL/Stats/TS.pm" # Exit with OK status 1; PDL-Stats-0.855/GENERATED/PDL/Stats/Kmeans.pm0000644000175000017500000005024614762214365020004 0ustar osboxesosboxes# # GENERATED WITH PDL::PP from lib/PDL/Stats/Kmeans.pd! Don't modify! 
# package PDL::Stats::Kmeans; our @EXPORT_OK = qw(random_cluster iv_cluster _random_cluster which_cluster assign centroid _d_p2l ); our %EXPORT_TAGS = (Func=>\@EXPORT_OK); use PDL::Core; use PDL::Exporter; use DynaLoader; our @ISA = ( 'PDL::Exporter','DynaLoader' ); push @PDL::Core::PP, __PACKAGE__; bootstrap PDL::Stats::Kmeans ; #line 8 "lib/PDL/Stats/Kmeans.pd" use strict; use warnings; use Carp; use PDL::LiteF; use PDL::Stats::Basic; =head1 NAME PDL::Stats::Kmeans -- classic k-means cluster analysis =head1 DESCRIPTION Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster". The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are non-broadcastable, respectively. =head1 SYNOPSIS Implement a basic k-means procedure, use PDL::LiteF; use PDL::Stats; my ($data, $idv, $ido) = rtable( $file ); # or generate random data: $data = grandom(200, 2); # two vars as below my ($cluster, $centroid, $ss_centroid, $cluster_last); # start out with 8 random clusters $cluster = random_cluster( $data->dim(0), 8 ); # iterate to minimize total ss # stop when no more changes in cluster membership do { $cluster_last = $cluster; ($centroid, $ss_centroid) = $data->centroid( $cluster ); $cluster = $data->assign( $centroid ); } while sum(abs($cluster - $cluster_last)) > 0; or, use the B function provided here, my %k = $data->kmeans( \%opt ); print "$_\t$k{$_}\n" for sort keys %k; plot the clusters if there are only 2 vars in $data, use PDL::Graphics::Simple; my ($win, $c); $win = pgswin(); $win->plot(map +(with=>'points', $data->dice_axis(0,which($k{cluster}->(,$_)))->dog), 0 .. 
$k{cluster}->dim(1)-1); =cut #line 82 "lib/PDL/Stats/Kmeans.pm" =head1 FUNCTIONS =cut #line 75 "lib/PDL/Stats/Kmeans.pd" #line 76 "lib/PDL/Stats/Kmeans.pd" =head2 random_cluster =for ref Creates masks for random mutually exclusive clusters. Accepts two parameters, num_obs and num_cluster. Extra parameter turns into extra dim in mask. May loop a long time if num_cluster approaches num_obs because empty cluster is not allowed. =for usage my $cluster = random_cluster( $num_obs, $num_cluster ); =cut # can't be called on pdl sub random_cluster { my ($obs, $clu, @extra) = @_; # extra param in @_ made into extra dim my $cluster = zeroes short(), @_; do { (random($obs, @extra) * $obs)->_random_cluster($cluster); } while (PDL::any $cluster->sumover == 0 ); $cluster; } #line 122 "lib/PDL/Stats/Kmeans.pm" *_random_cluster = \&PDL::_random_cluster; =head2 which_cluster =for sig Signature: (short a(o,c); indx [o]b(o)) Types: (ushort long) =for usage $b = which_cluster($a); which_cluster($a, $b); # all arguments given $b = $a->which_cluster; # method call $a->which_cluster($b); Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong. Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set. Usage: # create a cluster mask dim [obs x clu] pdl> p $c_mask = iv_cluster [qw(a a b b c c)] [ [1 1 0 0 0 0] [0 0 1 1 0 0] [0 0 0 0 1 1] ] # get cluster membership list dim [obs] pdl> p $ic = $c_mask->which_cluster [0 0 1 1 2 2] =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. 
=cut *which_cluster = \&PDL::which_cluster; =head2 assign =for sig Signature: (data(o,v); centroid(c,v); short [o]cluster(o,c)) Types: (float double) =for ref Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables. =for usage pdl> p $centroid = xvals 2, 3 [ [0 1] [0 1] [0 1] ] pdl> p $b = qsort( random 4, 3 ) [ [0.022774068 0.032513883 0.13890034 0.30942479] [ 0.16943853 0.50262636 0.56251531 0.7152271] [ 0.23964483 0.59932745 0.60967495 0.78452117] ] # notice that 1st 3 obs in $b are on average closer to 0 # and last obs closer to 1 pdl> p $b->assign( $centroid ) [ [1 1 1 0] # cluster 0 membership [0 0 0 1] # cluster 1 membership ] =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *assign = \&PDL::assign; =head2 centroid =for sig Signature: (data(o,v); cluster(o,c); [o]m(c,v); [o]ss(c,v)) Types: (float double) =for ref Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster. 
=for usage # data is 10 obs x 3 var pdl> p $d = sequence 10, 3 [ [ 0 1 2 3 4 5 6 7 8 9] [10 11 12 13 14 15 16 17 18 19] [20 21 22 23 24 25 26 27 28 29] ] # create two clusters by value on 1st var pdl> p $a = $d( ,(0)) <= 5 [1 1 1 1 1 1 0 0 0 0] pdl> p $b = $d( ,(0)) > 5 [0 0 0 0 0 0 1 1 1 1] pdl> p $c = cat $a, $b [ [1 1 1 1 1 1 0 0 0 0] [0 0 0 0 0 0 1 1 1 1] ] pdl> p $d->centroid($c) # mean for 2 cluster x 3 var [ [ 2.5 7.5] [12.5 17.5] [22.5 27.5] ] # ss for 2 cluster x 3 var [ [17.5 5] [17.5 5] [17.5 5] ] =pod Broadcasts over its inputs. =for bad C processes bad values. It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays. =cut *centroid = \&PDL::centroid; #line 307 "lib/PDL/Stats/Kmeans.pd" #line 308 "lib/PDL/Stats/Kmeans.pd" sub _scree_ind { # use as scree cutoff the point with max distance to the line formed # by the 1st and last points in $self # it's a heuristic--whether we can get "good" results depends on # the number of components in $self. my ($self) = @_; $self = $self->squeeze; $self->ndims > 1 and croak "1D pdl only please"; my $a = zeroes 2, $self->nelem; $a->slice('(0)') .= sequence $self->nelem; $a->slice('(1)') .= $self; my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') ); return $d->maximum_ind; } sub _d_point2line { my ($self, $p1, $p2) = @_; for ($self, $p1, $p2) { $_->dim(0) != 2 and carp "point pdl dim(0) != 2"; } return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog ); } #line 341 "lib/PDL/Stats/Kmeans.pm" *_d_p2l = \&PDL::_d_p2l; #line 358 "lib/PDL/Stats/Kmeans.pd" #line 359 "lib/PDL/Stats/Kmeans.pd" =head2 kmeans =for ref Implements classic k-means cluster analysis. =for example $data = grandom(200, 2); # two rows = two dimensions %k = $data->kmeans; # use default of 3 clusters print "$_\t$k{$_}\n" for sort keys %k; $w->plot( (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1), $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog), 0 .. 
$k{cluster}->dim(1)-1), (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1), 0 .. $k{centroid}->dim(0)-1), {le=>'tr'}, ); Given a number of observations with values on a set of variables, kmeans puts the observations into clusters that maximizes within-cluster similarity with respect to the variables. Tries several different random seeding and clustering in parallel. Stops when cluster assignment of the observations no longer changes. Returns the best result in terms of R2 from the random-seeding trials. Instead of random seeding, kmeans also accepts manual seeding. This is done by providing a centroid to the function, in which case clustering will proceed from the centroid and there is no multiple tries. There are two distinct advantages from seeding with a centroid compared to seeding with predefined cluster membership of a subset of the observations ie "seeds": =over =item * a centroid could come from a previous study with a different set of observations; =item * a centroid could even be "fictional", or in more proper parlance, an idealized prototype with respect to the actual data. For example, if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings pdl of dim [10 obs x 4 var], providing a centroid like [ [5 0 0 0] [0 5 0 0] [0 0 5 0] [0 0 0 5] ] will produce 4 clusters of people with each cluster favoring a different one of the 4 movies. Clusters from an idealized centroid may not give the best result in terms of R2, but they sure are a lot more interpretable. =back If clustering has to be done from predefined clusters of seeds, simply calculate the centroid using the B function and feed it to kmeans, my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster ); my %k = $rating->kmeans( { CNTRD=>$centroid } ); kmeans supports bad value*. 
=for options Default options (case insensitive): V => 1, # prints simple status FULL => 0, # returns results for all seeding trials CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts NTRY => 5, # num of random seeding trials NSEED => 1000, # num of initial seeds, use NSEED up to max obs NCLUS => 3, # num of clusters =for usage Usage: # suppose we have 4 person's ratings on 5 movies pdl> p $rating = ceil( random(4, 5) * 5 ) [ [3 2 2 3] [2 4 5 4] [5 3 2 3] [3 3 1 5] [4 3 3 2] ] # we want to put the 4 persons into 2 groups pdl> %k = $rating->kmeans( {NCLUS=>2} ) # by default prints back options used # as well as info for all tries and iterations CNTRD => Null FULL => 0 NCLUS => 3 NSEED => 4 NTRY => 5 V => 1 ss total: 20.5 iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748] iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748] pdl> p "$_\t$k{$_}\n" for sort keys %k R2 0.479674796747968 centroid # mean ratings for 2 group x 5 movies [ [ 3 2.3333333] [ 2 4.3333333] [ 5 2.6666667] [ 3 3] [ 4 2.6666667] ] cluster # 4 persons' membership in two groups [ [1 0 0 0] [0 1 1 1] ] n [1 3] # cluster size ss [ [ 0 0.66666667] [ 0 0.66666667] [ 0 0.66666667] [ 0 8] [ 0 0.66666667] ] Now, for the valiant, kmeans is broadcastable. Say you gathered 10 persons' ratings on 5 movies from 2 countries, so the data is dim [10,5,2], and you want to put the 10 persons from each country into 3 clusters, just specify NCLUS => [3,1], and there you have it. The key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1] turns into a dummy dim, so the 3-cluster operation is repeated on both countries. Similarly, when seeding, CNTRD needs to have ndims that at least match the data ndims. Extra dims in CNTRD will lead to broadcasting (convenient if you want to try out different centroid locations, for example, but you will have to hand pick the best result). See F for examples w 3D and 4D data. 
*With bad value, R2 is based on average of variances instead of sum
squared error.

=cut

*kmeans = \&PDL::kmeans;
sub PDL::kmeans {
  my ($self, $opt) = @_;
  my %opt = (
    V     => 1,          # prints simple status
    FULL  => 0,          # returns results for all seeding trials
    CNTRD => PDL->null,  # optional. pdl [clu x var]. disables next 3 opts
    NTRY  => 5,          # num of random seeding trials
    NSEED => 1000,       # num of initial seeds, use NSEED up to max obs
    NCLUS => 3,          # num of clusters
  );
  # case-insensitive user options override the defaults
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) {
    # manual seeding: one deterministic run starting from the given centroid
    $opt{NTRY}  = 1;
    $opt{NSEED} = $self->dim(0);
    $opt{NCLUS} = $opt{CNTRD}->dim(0);
  }
  else {
    # cap the number of initial seeds at the number of observations
    $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr;
  }
  $opt{V} and print "$_\t=> $opt{$_}\n" for sort keys %opt;

  # with bad values present, report mean-square ('ms') instead of
  # sum-of-squares ('ss'); see the POD footnote above
  my $ss_ms = $self->badflag?  'ms' : 'ss';
  my $ss_total
    = $self->badflag?  $self->var->average : $self->ss->sumover;
  $opt{V} and print "overall $ss_ms:\t$ss_total\n";

  my ($centroid, $ss_cv, $R2, $clus_this, $clus_last);

  # NTRY made into extra dim in $cluster for broadcasting
  my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS});
  $clus_this
    = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ?
      $self->assign( $opt{CNTRD}->dummy(-1) )   # put dummy(-1) to match NTRY
    : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} )
    ;

  # seed centroids from the first NSEED observations only …
  ($centroid, $ss_cv) = $self->slice([0,$opt{NSEED} - 1])->centroid( $clus_this );
  # now obs in $clus_this matches $self
  $clus_this = $self->assign( $centroid );
  ($centroid, $ss_cv) = $self->centroid( $clus_this );

  my $iter = 0;
  # iterate assign/centroid until cluster membership stops changing
  do {
    $R2
      = $self->badflag?  1 - $ss_cv->average->average / $ss_total
      :                  1 - $ss_cv->sumover->sumover / $ss_total
      ;
    $opt{V} and print join(' ',('iter', $iter++, 'R2', $R2)) . "\n";
    $clus_last = $clus_this;
    $clus_this = $self->assign( $centroid );
    ($centroid, $ss_cv) = $self->centroid( $clus_this );
  }
  while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 );

  # FULL: hand back every trial's result, squeezed of degenerate dims
  $opt{FULL} and return (
    centroid => PDL::squeeze( $centroid ),
    cluster  => PDL::squeeze( $clus_this ),
    n        => PDL::squeeze( $clus_this )->sumover,
    R2       => PDL::squeeze( $R2 ),
    $ss_ms   => PDL::squeeze( $ss_cv ),
  );

  # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose
  my $i_best = $R2->mv(-1,0)->maximum_ind;

  # 2D data: a single best trial index selects the winning solution
  $R2->getndims == 1 and return (
    centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze,
    cluster  => $clus_this->dice_axis(-1,$i_best)->sever->squeeze,
    n        => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover,
    R2       => $R2->dice_axis(-1,$i_best)->sever->squeeze,
    $ss_ms   => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze,
  );

  # now for broadcasting beyond 2D data
  # can't believe i'm using a perl loop :P
  # map each broadcast slab's best-trial index onto the clumped trial dim
  $i_best = $i_best->flat->sever;
  my @i_best = map { $opt{NTRY} * $_ + $i_best->slice("($_)") }
               0 .. $i_best->nelem - 1;

  # remember the output shapes: original dims minus the trailing NTRY dim
  my @shapes;
  for ($centroid, $clus_this, $R2) {
    my @dims = $_->dims;
    pop @dims;
    push @shapes, \@dims;
  }
  # NB: the trailing comma below is the comma operator, chaining this
  # assignment onto the return statement that follows (not a typo'd ';')
  $clus_this
    = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever,
  return (
    centroid => $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
    cluster  => $clus_this,
    n        => $clus_this->sumover,
    R2       => $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever,
    $ss_ms   => $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever,
  );
}

=head1 METHODS

=head2 iv_cluster

=for ref

Turns an independent variable into a cluster pdl. Returns cluster pdl
and level-to-pdl_index mapping in list context and cluster pdl only in
scalar context.

This is the method used for mean and var in anova.
The difference between iv_cluster and dummy_code is that iv_cluster
returns pdl dim [obs x level] whereas dummy_code returns pdl dim
[obs x (level - 1)].

=for usage

Usage:

    pdl> @bake = qw( y y y n n n )

    # accepts @ ref or 1d pdl

    pdl> p $bake = iv_cluster( \@bake )
    [
     [1 1 1 0 0 0]
     [0 0 0 1 1 1]
    ]

    pdl> p $rating = sequence 6
    [0 1 2 3 4 5]

    pdl> p $rating->centroid( $bake )
    # mean for each iv level
    [
     [1 4]
    ]
    # ss
    [
     [2 2]
    ]

=cut

*iv_cluster = \&PDL::iv_cluster;
sub PDL::iv_cluster {
  my ($var_ref) = @_;
  # code_ivs maps each unique level of the input to an integer 0 .. max,
  # returning the coded pdl and the level-to-index mapping
  my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
  # compare an [obs x level] index grid against the coded var to build a
  # 0/1 membership mask with one column per level
  my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var;
  # propagate badness from the coded var into every level column
  $var_a = $var_a->setbadif( $var->isbad )
    if $var->badflag;
  wantarray ? ($var_a, $map_ref) : $var_a;
}

=head2 pca_cluster

Assign variables to components ie clusters based on pca loadings or scores.
One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004 for other
ways of using pca with kmeans). Variables are assigned to their most
associated component. Note that some components may not have any variable
that is most associated with them, so the returned number of clusters may
be smaller than NCOMP.

Default options (case insensitive):

    V     => 1,
    ABS   => 1,     # high pos and neg loadings on a comp in same cluster
    NCOMP => undef, # max number of components to consider. determined by
                    # scree plot black magic if not specified
    PLOT  => 0,     # pca scree plot with cutoff at NCOMP
    WIN   => undef, # pass pgswin object for more plotting control

Usage:

    # say we need to cluster a group of documents
    # $data is pdl dim [word x doc]
    ($data, $idd, $idw) = get_data 'doc_word_info.txt';

    pdl> %p = $data->pca;

    # $cluster is pdl mask dim [doc x ncomp]
    pdl> $cluster = $p{loading}->pca_cluster;

    # pca clusters var while kmeans clusters obs. hence transpose
    pdl> ($m, $ss) = $data->transpose->centroid( $cluster );
    pdl> %k = $data->transpose->kmeans( { cntrd=>$m } );

    # take a look at cluster 0 doc ids
    pdl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]);

=cut

*pca_cluster = \&PDL::pca_cluster;
sub PDL::pca_cluster {
  my ($self, $opt) = @_;
  my %opt = (
    V     => 1,
    ABS   => 1,     # high pos and neg loadings on a comp in same cluster
    NCOMP => undef, # max number of components to consider. determined by
                    # scree plot black magic if not specified
    PLOT  => 0,     # pca scree plot with cutoff at NCOMP
    WIN   => undef, # pass pgswin object for more plotting control
  );
  # case-insensitive user options override the defaults
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  # proportion of variance carried by each component
  my $var = sumover($self ** 2) / $self->dim(0);
  if (!$opt{NCOMP}) {
      # here's the black magic part
      # consider at most 10% of the components (when >300) or 30, then let
      # the scree-plot elbow heuristic (_scree_ind) choose NCOMP
    my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1)
              :                        pdl($self->dim(1), 30)->min
              ;
    $var = $var->slice([0,$comps-1])->sever;
    $opt{NCOMP} = _scree_ind( $var );
  }
  $opt{PLOT} and do {
    require PDL::Stats::GLM;
    $var->plot_screes({NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}, WIN=>$opt{WIN}});
  };
  # assign each var to the component on which it has the largest |loading|
  my $c = $self->slice(':',[0,$opt{NCOMP}-1])->transpose->abs->maximum_ind;
  if ($opt{ABS}) {
    $c = $c->iv_cluster;
  }
  else {
    # split each component in two: positive vs negative loadings get
    # separate (even/odd-numbered) clusters
    my @c = map { ($self->slice($_,$c->slice($_)) >= 0)? $c->slice($_)*2
                :                                        $c->slice($_)*2 + 1
                } ( 0 .. $c->dim(0)-1 );
    $c = iv_cluster( \@c );
  }
  $opt{V} and print "cluster membership mask as " . $c->info . "\n";
  return $c;
}

=head1 REFERENCES

Ding, C., & He, X. (2004). K-means clustering via principal component
analysis. Proceedings of the 21st International Conference on Machine
Learning, 69, 29.

Su, T., & Dy, J. (2004). A deterministic method for initializing K-means
clustering. 16th IEEE International Conference on Tools with Artificial
Intelligence, 784-786.

Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press.

Wikipedia (retrieved June, 2009). K-means clustering.
http://en.wikipedia.org/wiki/K-means_algorithm

=head1 AUTHOR

Copyright (C) 2009 Maggie J. Xiong

All rights reserved.
There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut #line 767 "lib/PDL/Stats/Kmeans.pm" # Exit with OK status 1; PDL-Stats-0.855/META.yml0000644000175000017500000000157614762214364014467 0ustar osboxesosboxes--- abstract: 'a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people.' author: - 'Maggie J. Xiong ' build_requires: PDL: '2.099' Test::More: '0.88' Test::PDL: '0.21' configure_requires: PDL: '2.099' dynamic_config: 1 generated_by: 'ExtUtils::MakeMaker version 7.7106, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: PDL-Stats no_index: directory: - t - inc recommends: PDL::GSL: '0' PDL::Graphics::Simple: '0' requires: PDL: '2.099' perl: '5.016' resources: IRC: irc://irc.perl.org/#pdl bugtracker: https://github.com/PDLPorters/PDL-Stats/issues repository: git://github.com/PDLPorters/PDL-Stats.git version: '0.855' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' PDL-Stats-0.855/Changes0000644000175000017500000002232714762214301014475 0ustar osboxesosboxes0.855 2025-03-06 - fix recent change that allowed negative variance and standard deviation (#34) - thanks @shawnlaffan for report 0.854 2025-02-25 - fix anova_rptd problem depending on grouping of IVs (part of RT#97925) - do all plotting with PDL::Graphics::Simple - add Basic::code_ivs - all OLS and ANOVA now add the 1s column in design at pos 0 so intercepts now B_0 - add GLM::anova_design_matrix - fix anova_design_matrix for between-subject >2 values (rest of RT#97925) - add demo - remove filt_exp and filt_ma, deprecated in 2011 - now an exception to give anova n-observations <= product of categories in IVs - anova cell means keys now separated by "|" not "#" to sort logically - add GLM::plot_stripchart 0.853 2025-01-03 - uses PDL 2.096+ lib/*.pd format for quicker 
builds 0.852 2024-12-28 - no longer uses PDL::Slatec 0.851 2024-12-27 - relax the tests' type requirement 0.850 2024-12-27 - removed TS::inte (identical to cumusumover) - moved TS::diff (unused here) to Ufunc - move PDL::Stats::Distr into PDL::GSL since needs GSL - minimum 5.16 as some 5.15 broke PDL::Stats (https://github.com/Perl/perl5/issues/11835) - fix unnecessary type-specs that cause problems for Inplace (https://github.com/PDLPorters/pdl/issues/511) 0.84 2024-05-27 - effect_code exception if only one unique value - fix for Distr so M_PI found - thanks @shawnlaffan 0.83 2023-04-01 - install pdldoc using PDL::Doc 0.82 2022-05-23 - dodge false-negative on NiceSlice 0.81 2022-01-11 - fix Kmeans NiceSlice for PDL 2.066 0.80 2021-10-02 - add COV option to GLM::logistic - thanks David Myers for suggestion (in 2011) - make all PLOT options default to off; only load PGPLOT when try to use - remove plot_season and dsea deprecated since 0.5.0 - effect_code and iv_cluster now don't croak on multi-dim inputs - opt in to upcoming PDL multi-C feature 0.79 2021-08-17 - update now that PDL's ->sum etc return ndarrays, not Perl scalars 0.78 2021-04-24 - update build to avoid false positives for working CDF library 0.77 2021-04-20 - moved the GSL CDF functions over to main PDL - thanks @maggiexyz! 
0.76 2020-12-19 - spelling fixes - thanks @sebastic 0.75 2016-05-30 - no change from 0.74_1 0.74_1 2016-05-28 - build updates 0.74 Fri Dec 4 22:09:47 2015 -0500 - no change from 0.73_1 0.73_1 Sun Nov 29 10:15:44 2015 -0500 - Improved support for 64bit indexing (kmx) - PDL::GSL::CDF facelift (kmx) - kmeans test workaround (kmx) - messages for various tests (kaoru) - doc-building improvements 0.72 Mon Aug 31 18:54:08 BST 2015 - changelog test in CI - remove erroneous dep on PDL::GSL::CDF 0.71 Sat Aug 29 18:54:08 BST 2015 - Beef up CI with optional requirements spelled out and apt installed - kmeans and kmeans_bad tests retry, marked TODO as fails on clang - add badges to README and rename to Markdown file - Print out kmeans_bad test for debugging. - Metadata fix - Add manifest test, remove META.yml from main MANIFEST 0.70 18/08/2015 - Updated to work with PDL-2.008. More tests for anova. 0.6.5 11/03/2013 - Updated to work with PDL-2.007. Real 64bit support yet to come. 0.6.4 09/05/2013 - Make 'cpan PDL::Stats' work. 0.6.3 05/05/2013 - binomial_test() bug fix -- [rt.cpan.org #82725] - pmf_poisson() updated to handle large input values thanks to David Mertens. - ols() bad value support update thanks to Ingo Schmid. - anova(), anova_rptd() bad value support update. 0.6.2 04/29/2012 - Windows pdldoc database install fix. 0.6.1 04/24/2012 - Bails out of installation if running under perl 5.15.0 to 5.15.8 but allows --force to attempt a build regardless. 0.6.0 04/08/2012 - Added group_by() method in Basic.pm. - Updates pdl doc database. - Bails out of installation if running under perl 5.14.0 to 5.15.8. - Removed old deprecated method get_data() from Basic.pm - Moved no slatec warning from module to test. Removed lvalue assignments from tests. 0.5.5 04/22/2011 - GLM Windows 32-bit test bug fix; GLM quits installation with PDL-2.4.7 warning. Documentation update. 0.5.4 04/19/2011 - GLM pca() and plot_scores() can do cov version. 
- [rt.cpan.org #67557] Undo PREREQ_FATAL => 1 in Makefile.PL. 0.5.3 04/18/2011 - stats_glm.t pca() test SLATEC vs. MatrixOps precision fix. 0.5.2 04/17/2011 - stats_glm.t pca() test precision fix. 0.5.1 04/16/2011 - ***GLM pca() return value names updated*** to be more compatible with the literature. - GLM added plot_scores(); plot_scree() name changed to plot_screes() for consistency across plotting functions. - Updated plotting functions to default to png in Windows because GW doesn't reopen after close. 0.5.0 04/02/2011 - Updated tests to use __DATA__ instead of separate test data files. - GLM added interaction_code(); bug fix for dummy_code(). - GLM added ols_rptd()! - GLM added plot_residual(). - GLM eval wrapped optional modules; replaced perl reverse sort with slice (-1:0). - Basic added binomial_test(). - TS updated plot_season() to season_m(); deprecated a few old function names, filt_exp(), filt_ma(), and dsea(). 0.4.3 04/08/2010 - Distr::plot_distr() bug fix. - Kmeans::kmeans() pod update. 0.4.2 03/27/2010 - rtable() added to PDL::Basic. get_data() deprecated. Preparing for transitioning to using rcols() for IO. - perldl online help support! (ie "? stdv" in perldl works) 0.4.1 02/02/2010 - GLM anova anova_rptd IVNM default val matched to []. - Makefile.PL exit 0 without PDL::Lite. - TS added wmape - pod fix for CPAN 0.4.0 01/24/2010 - GLM anova_rptd supports between subject factors! We are now beta! 0.3.2 01/08/2010 - Kmeans kmeans CNTRD checks undef besides PDL->null. opt{V} printout update - Kmeans added which_cluster - Kmeans pca_cluster does scree plot with cutoff line - GLM logistic pod update - GLM anova anova_rptd pca plot_scree plotting options update 0.3.1 12/16/2009 - Removed pms--relying on pdl-stats.sourceforge.net for web documentation instead of CPAN. Minor pod update. - GLM::ols_t removed $y->squeeze, so that edge case like ols_t on 1 obs threads properly (y[1,10]->ols_t(x[1,10]->dummy(1), {const=>0}) gives b[1,10]). 
- GLM::anova_rptd se fix. - GLM::anova and anova_rptd plot se fix. - PDL::Stats::TS included. ***alpha alpha*** code! Included because I'm tired of maintaining it separately from PDL::Stats. 0.3.0 12/03/2009 - GLM::anova_rptd! works for purely within designs - GLM::anova bug fix--not messing up caller @ if passed @ ref in IV instead of pdl - Change prereq from PDL to PDL::Lite - Stats.pm quick-start guide update - t/stats_glm logistic test modified to use more proper numbers in x 0.2.8 11/04/2009 - Fixed minor bug (mismatched nseed and self cluster pdls. oops) in switching to use cluster assignment as stop criteria. btw initial R2 now based on self instead of seeds. 0.2.7 11/04/2009 - GLM::plot_scree allows directly passing NCOMP. plots cutoff line for screes if requested - kmeans stops when cluster assignment no longer changes instead of using R2CRT. - If not specified, Kmeans::pca_cluster studies scree plot to determine NCOMP Basic::get_data STDERR message fix: data pdl o x v, NOT v x o 0.2.6 10/12/2009 - anova bad value se bug fix - centroid bad value empty cluster bug fix - basic, glm, distr rid of nan and use bad value instead 0.2.5 09/17/2009 - PREREQ_FATAL => 1 in Makefile.PL - added pca_cluster in Kmeans. - anova one-way bug fix. - plot_means one-way bug fix. 0.2.4 09/02/2009 - added corr_table in PDL::Stats::Basic. - PDL::Stats::GLM::pca uses corr_table. - kmeans no longer quits with empty cluster. kmeans 3d threading bug fix. - use short-circuit ISBAD(a) || ISBAD(b) instead of ISGOOD(a) && ISGOOD(b) in bad value processing. 0.2.3 08/24/2009 - ***anova interface change*** IV names now passed in \%opt instead of @ ref. I apologize for the interface change, but it matches the new method plot_means, and makes it easier to pass IVs themselves. I promise that such interface changes will be extremely rare, if at all. - anova now supports bad value in DV. - Added plot_means in PDL::Stats::GLM. anova plots highest order interaction by default. 
- Added plot_scree in PDL::Stats::GLM. pca does scree plot by default. - Added pca_sorti in PDL::Stats::GLM. - plot_distr in PDL::Stats::Distr accepts WIN option - Set prerequisite PDL => 2.4.4, which has bad value support by default 0.2.2 07/23/2009 - stats_distr.t moved form t/ to Distr/t, ie skip the test if no GSL. - more kmeans fix (4d, seeding). added seeded kmeans tests. 0.2.1 07/22/2009 - kmeans 4d fix. added pms dir for cpan pod parse 0.2.0 07/21/2009 - kmeans threads! ss/ms bug fix. more tests on kmeans added. 0.1.3 07/17/2009 - quick-start for non-PDL people in Stats.pm pod - ols_t uses double internally. GLM pod update - r2_change threads (switched to use ols_t instead of ols) - get_data which_id moved to Basic from GLM - tests grouped in PDL-Stats/t except for PDL::GSL::CDF 0.1.2 07/15/2009 - Cleaned up some versioning stuff in Stats.pm - GLM::get_data TYPE default to double. MISSN handling update; set to BAD by default. - Case insensitive option passing. 0.1.1 07/14/2009 - Fixed PDL::Stats::GLM::ols b_p value for b_t < 0. 0.1.0 07/13/2009 - First version, released on an unsuspecting world. PDL-Stats-0.855/lib/0000755000175000017500000000000014762214364013753 5ustar osboxesosboxesPDL-Stats-0.855/lib/PDL/0000755000175000017500000000000014762214364014372 5ustar osboxesosboxesPDL-Stats-0.855/lib/PDL/Stats.pm0000644000175000017500000001662614762214261016035 0ustar osboxesosboxespackage PDL::Stats; use strict; use warnings; our $VERSION = '0.855'; sub import { my $pkg = (caller())[0]; my $cdf = eval { require PDL::Core; require PDL::GSL::CDF; 1 }; my $distr = eval { require PDL::Core; require PDL::Stats::Distr; 1 }; my $use = <<"EOD"; package $pkg; use PDL::Stats::Basic; use PDL::Stats::GLM; use PDL::Stats::Kmeans; use PDL::Stats::TS; @{[ $distr ? 'use PDL::Stats::Distr;' : '' ]} @{[ $cdf ? 
'use PDL::GSL::CDF;' : '' ]} EOD eval $use; die $@ if $@; } =head1 NAME PDL::Stats - a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDL people. =head1 DESCRIPTION Loads modules named below, making the functions available in the current namespace. Properly formatted documentations online at http://pdl-stats.sf.net =head1 SYNOPSIS use PDL::LiteF; # loads fewer modules use PDL::Stats; # Is equivalent to the following: use PDL::Stats::Basic; use PDL::Stats::GLM; use PDL::Stats::Kmeans; use PDL::Stats::TS; # and the following if installed; use PDL::Stats::Distr; use PDL::GSL::CDF; =head1 QUICK-START FOR NON-PDL PEOPLE Enjoy PDL::Stats without having to dive into PDL, just wet your feet a little. Three key words two concepts and an icing on the cake, you should be well on your way there. =head2 pdl The magic word that puts PDL::Stats at your disposal. pdl creates a PDL numeric data object (a pdl, or ndarray) from perl array or array ref. All PDL::Stats methods, unless meant for regular perl array, can then be called from the data object. my @y = 0..5; my $y = pdl @y; # a simple function my $stdv = $y->stdv; # you can skip the intermediate $y my $stdv = stdv( pdl @y ); # a more complex method, skipping intermediate $y my @x1 = qw( y y y n n n ); my @x2 = qw( 1 0 1 0 1 0 ) # do a two-way analysis of variance with y as DV and x1 x2 as IVs my %result = pdl(@y)->anova( \@x1, \@x2 ); print "$_\t$result{$_}\n" for sort keys %result; If you have a list of list, ie array of array refs, pdl will create a multi-dimensional data object. my @a = ( [1,2,3,4], [0,1,2,3], [4,5,6,7] ); my $a = pdl @a; print $a . $a->info; # here's what you will get [ [1 2 3 4] [0 1 2 3] [4 5 6 7] ] PDL: Double D [4,3] PDL::Stats puts observations in the first dimension and variables in the second dimension, ie pdl [obs, var]. In PDL::Stats the above example represents 4 observations on 3 variables. # you can do all kinds of fancy stuff on such a 2D pdl. 
my %result = $a->kmeans( {NCLUS=>2} ); print "$_\t$result{$_}\n" for sort keys %result; Make sure the array of array refs is rectangular. If the array refs are of unequal sizes, pdl will pad it out with 0s to match the longest list. =head2 info Tells you the data type (yes pdls are typed, but you shouldn't have to worry about it here*) and dimensionality of the pdl, as seen in the above example. I find it a big help for my sanity to keep track of the dimensionality of a pdl. As mentioned above, PDL::Stats uses 2D pdl with observation x variable dimensionality. *pdl uses double precision by default. If you are working with things like epoch time, then you should probably use pdl(long, @epoch) to maintain the precision. =head2 list Come back to the perl reality from the PDL wonder land. list turns a pdl data object into a regular perl list. Caveat: list produces a flat list. The dimensionality of the data object is lost. =head2 Signature This is not a function, but a concept. You will see something like this frequently in the pod: stdv Signature: (a(n); [o]b()) The signature tells you what the function expects as input and what kind of output it produces. a(n) means it expects a 1D pdl with n elements; [o] is for output, b() means its a scalar. So stdv will take your 1D list and give back a scalar. The name a or b or c is not important. What's important is the thing in the parenthesis. corr Signature: (a(n); b(n); [o]c()) Here the function corr takes two inputs, two 1D pdl with the same numbers of elements, and gives back a scalar. t_test Signature: (a(n); b(m); [o]t(); [o]d()) Here the function t_test can take two 1D pdls of unequal size (n==m is certainly fine), and give back two scalars, t-value and degrees of freedom. Yes we accommodate t-tests with unequal sample sizes. assign Signature: (data(o,v); centroid(c,v); byte [o]cluster(o,c)) Here is one of the most complicated signatures in the package. This is a function from Kmeans. 
assign takes data of observation x variable dimensions, and a centroid of cluster x variable dimensions, and returns an observation x cluster membership pdl (indicated by 1s and 0s). Got the idea? Then we can see how PDL does its magic :) =head2 Broadcasting Another concept. The first thing to know is that, broadcasting is optional. PDL broadcasting means automatically repeating the operation on extra elements or dimensions fed to a function. For a function with a signature like this gsl_cdf_tdist_P Signature: (double x(); double nu(); [o]out()) the signatures says that it takes two scalars as input, and returns a scalar as output. If you need to look up the p-values for a list of t's, with the same degrees of freedom 19, my @t = ( 1.65, 1.96, 2.56 ); my $p = gsl_cdf_tdist_P( pdl(@t), 19 ); print $p . "\n" . $p->info; # here's what you will get [0.94231136 0.96758551 0.99042586] PDL: Double D [3] The same function is repeated on each element in the list you provided. If you had different degrees of freedoms for the t's, my @df = (199, 39, 19); my $p = gsl_cdf_tdist_P( pdl(@t), pdl(@df) ); print $p . "\n" . $p->info; # here's what you will get [0.94973979 0.97141553 0.99042586] PDL: Double D [3] The df's are automatically matched with the t's to give you the results. An example of broadcasting over extra dimension(s): stdv Signature: (a(n); [o]b()) if the input is of 2D, say you want to compute the stdv for each of the 3 variables, my @a = ( [1,1,3,4], [0,1,2,3], [4,5,6,7] ); # pdl @a is pdl dim [4,3] my $sd = stdv( pdl @a ); print $sd . "\n" . $sd->info; # this is what you will get [ 1.2990381 1.118034 1.118034] PDL: Double D [3] Here the function was given an input with an extra dimension of size 3, so it repeats the stdv operation on the extra dimension 3 times, and gives back a 1D pdl of size 3. Broadcasting works for arbitrary number of dimensions, but it's best to refrain from higher dim pdls unless you have already decided to become a PDL wiz / witch. 
Not all PDL::Stats methods broadcast. As a rule of thumb, if a function has a signature attached to it, it broadcasts. =head2 perldl Essentially a perl shell with "use PDL;" at start up. Comes with the PDL installation. Very handy to try out pdl operations, or just plain perl. print is shortened to p to avoid injury from excessive typing. C goes out of scope at the end of (multi)line input, so mostly you will have to drop the good practice of my here. =head2 For more info L =head1 AUTHOR ~~~~~~~~~~~~ ~~~~~ ~~~~~~~~ ~~~~~ ~~~ `` ><((("> Copyright (C) 2009-2015 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut 1; PDL-Stats-0.855/lib/PDL/Stats/0000755000175000017500000000000014762214364015470 5ustar osboxesosboxesPDL-Stats-0.855/lib/PDL/Stats/GLM.pd0000644000175000017500000021177414757257632016461 0ustar osboxesosboxesuse strict; use warnings; my $F = [qw(F D)]; pp_add_exported(qw(ols_t ols ols_rptd anova anova_rptd anova_design_matrix dummy_code effect_code effect_code_w interaction_code r2_change logistic pca pca_sorti plot_means plot_residuals plot_screes )); pp_addpm({At=>'Top'}, <<'EOD'); use strict; use warnings; use Carp; use PDL::LiteF; use PDL::MatrixOps; use PDL::Stats::Basic; use PDL::Stats::Kmeans; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; =encoding utf8 =head1 NAME PDL::Stats::GLM -- general and generalized linear modelling methods such as ANOVA, linear regression, PCA, and logistic regression. =head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::GLM; # do a multiple linear regression and plot the residuals my $y = pdl( 8, 7, 7, 0, 2, 5, 0 ); my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component [ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component my %m = $y->ols( $x, {plot=>1} ); print "$_\t$m{$_}\n" for sort keys %m; =head1 DESCRIPTION For more about general linear modelling, see L. 
For an unbelievably thorough text on experimental design and analysis, including linear modelling, see L. The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are NOT broadcastable, respectively. FUNCTIONS support bad values. P-values, where appropriate, are provided if PDL::GSL::CDF is installed. =cut EOD pp_addhdr(' #include #include #include ' ); pp_def('fill_m', Pars => 'a(n); [o]b(n)', Inplace => 1, GenericTypes => $F, HandleBad => 1, Code => ' broadcastloop %{ PDL_IF_BAD($GENERIC(b) sa = 0; PDL_Indx N = 0; loop (n) %{ if ($ISBAD($a())) continue; sa += $a(); N++; %} $GENERIC(b) m = N ? sa / N : 0;,) loop (n) %{ $b() = PDL_IF_BAD($ISBAD($a()) ? m :,) $a(); %} %} $PDLSTATESETGOOD(b); ', Doc => ' =for ref Replaces bad values with sample mean. Mean is set to 0 if all obs are bad. =for usage pdl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] pdl> p $data->fill_m [ [ 5 3.5 2 3.5] [ 7 3 7 5.66667] ] ', BadDoc => 'The output pdl badflag is cleared.', ); pp_def('fill_rand', Pars => 'a(n); [o]b(n)', Inplace => 1, HandleBad => 1, Code => ' broadcastloop %{ PDL_IF_BAD($GENERIC(a) *g[ $SIZE(n) ]; PDL_Indx i = 0; srand( time( NULL ) ); loop (n) %{ if ( $ISGOOD($a()) ) { g[i++] = &$a(); } %},) loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) { /* XXX-FIXME works on 64bit, but rand() is quite limited */ PDL_Indx j = (PDL_Indx) ((i-1) * (double)(rand()) / (double)(RAND_MAX) + .5); $b() = *g[j]; continue; },) $b() = $a(); %} %} $PDLSTATESETGOOD(b); ', Doc => ' =for ref Replaces bad values with random sample (with replacement) of good observations from the same variable. 
=for usage pdl> p $data [ [ 5 BAD 2 BAD] [ 7 3 7 BAD] ] pdl> p $data->fill_rand [ [5 2 2 5] [7 3 7 7] ] ', BadDoc => 'The output pdl badflag is cleared.', ); pp_def('dev_m', Pars => 'a(n); [o]b(n)', Inplace => 1, GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, m = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); PDL_IF_BAD(N++;,) %} m = sa / N; loop (n) %{ if ($ISBAD($a())) { $SETBAD(b()); continue; } $b() = $a() - m; %} ', Doc => 'Replaces values with deviations from the mean.', ); pp_def('stddz', Pars => 'a(n); [o]b(n)', Inplace => 1, GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, a2 = 0, m = 0, sd = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) sa += $a(); a2 += $a() * $a(); PDL_IF_BAD(N++;,) %} PDL_IF_BAD(if (N < 1) { loop (n) %{ $SETBAD(b()); %} continue; },) m = sa / N; sd = pow( a2/N - m*m, .5 ); loop (n) %{ PDL_IF_BAD(if ($ISBAD(a())) { $SETBAD(b()); continue; },) /* sd? does not work, presumably due to floating point */ $b() = (sd>0)? 
(($a() - m) / sd) : 0; %} ', Doc => 'Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0).', ); pp_def('sse', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) ss = 0; loop (n) %{ PDL_IF_BAD(if ( $ISBAD($a()) || $ISBAD($b()) ) continue;,) $GENERIC() diff = $a() - $b(); ss += diff * diff; %} $c() = ss; ', Doc => 'Sum of squared errors between actual and predicted values.', ); pp_def('mse', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) ss = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ( $ISBAD($a()) || $ISBAD($b()) ) continue;,) $GENERIC() diff = $a() - $b(); ss += diff * diff; PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(c()); continue; } $c() = ss/N; ', Doc => 'Mean of squared errors between actual and predicted values, ie variance around predicted value.', ); pp_def('rmse', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) d2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ( $ISBAD($a()) || $ISBAD($b()) ) continue;,) $GENERIC() diff = $a() - $b(); d2 += diff * diff; PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(c()); continue; } $c() = sqrt( d2 / N ); ', Doc => 'Root mean squared error, ie stdv around predicted value.', ); pp_def('pred_logistic', Pars => 'a(n,m); b(m); [o]c(n)', GenericTypes => $F, HandleBad => 1, Code => ' loop (n) %{ $GENERIC(c) l = 0; PDL_IF_BAD(PDL_Indx bad = 0;,) loop (m) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) { bad = 1; continue; },) l += $a() * $b(); %} PDL_IF_BAD(if (bad) { $SETBAD($c()); continue; },) $c() = 1 / ( 1 + exp(-l) ); %} ', Doc => ' =for ref Calculates predicted prob value for logistic regression. 
=for usage # glue constant then apply coeff returned by the logistic method $pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} ); ', ); pp_def('d0', Pars => 'a(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) p = 0, ll = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) p += $a(); PDL_IF_BAD(N++;,) %} if (N < 1) { $SETBAD(c()); continue; } p /= N; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) ll += $a()? log( p ) : log( 1 - p ); %} $c() = -2 * ll; ', Doc => 'Null deviance for logistic regression.', ); pp_def('dm', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) ll; ll = 0; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) continue;,) ll += $a()? log( $b() ) : log( 1 - $b() ); %} $c() = -2 * ll; ', Doc => ' =for ref Model deviance for logistic regression. =for usage my $dm = $y->dm( $y_pred ); # null deviance my $d0 = $y->dm( ones($y->nelem) * $y->avg ); ', ); pp_def('dvrs', Pars => 'a(); b(); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) { $SETBAD( $c() ); continue; },) $c() = $a()? sqrt( -2 * log($b()) ) : -1 * sqrt( -2 * log(1-$b()) ) ; ', Doc => 'Deviance residual for logistic regression.', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 ols_t =for ref Broadcasted version of ordinary least squares regression (B). The price of broadcasting was losing significance tests for coefficients (but see B). The fitting function was shamelessly copied then modified from PDL::Fit::Linfit. Intercept is FIRST of coeff if CONST => 1. ols_t does not handle bad values. consider B or B if there are bad values. 
=for options Default options (case insensitive): CONST => 1, =for usage Usage: # DV, 2 person's ratings for top-10 box office movies # ascending sorted by box office numbers pdl> p $y = pdl '1 1 2 4 4 4 4 5 5 5; 1 2 2 2 3 3 3 3 5 5' # model with 2 IVs, a linear and a quadratic trend component pdl> $x = cat sequence(10), sequence(10)**2 # suppose our novice modeler thinks this creates 3 different models # for predicting movie ratings pdl> p $x = cat $x, $x * 2, $x * 3 [ [ [ 0 1 2 3 4 5 6 7 8 9] [ 0 1 4 9 16 25 36 49 64 81] ] [ [ 0 2 4 6 8 10 12 14 16 18] [ 0 2 8 18 32 50 72 98 128 162] ] [ [ 0 3 6 9 12 15 18 21 24 27] [ 0 3 12 27 48 75 108 147 192 243] ] ] pdl> p $x->info PDL: Double D [10,2,3] # insert a dummy dim between IV and the dim (model) to be broadcasted pdl> %m = $y->ols_t( $x->dummy(2) ) pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m # 2 persons' ratings, each fitted with 3 "different" models F [ [ 38.314159 25.087209] [ 38.314159 25.087209] [ 38.314159 25.087209] ] # df is the same across dv and iv models F_df [2 7] F_p [ [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] [0.00016967051 0.00064215074] ] R2 [ [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] [ 0.9162963 0.87756762] ] b [ # constant linear quadratic [ [ 0.66363636 0.99015152 -0.056818182] # person 1 [ 1.4 0.18939394 0.022727273] # person 2 ] [ [ 0.66363636 0.49507576 -0.028409091] [ 1.4 0.09469697 0.011363636] ] [ [ 0.66363636 0.33005051 -0.018939394] [ 1.4 0.063131313 0.0075757576] ] ] # our novice modeler realizes at this point that # the 3 models only differ in the scaling of the IV coefficients ss_model [ [ 20.616667 13.075758] [ 20.616667 13.075758] [ 20.616667 13.075758] ] ss_residual [ [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] [ 1.8833333 1.8242424] ] ss_total [22.5 14.9] y_pred [ [ [0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273] ... 
=cut *ols_t = \&PDL::ols_t; sub PDL::ols_t { _ols_common(1, @_); } sub _ols_common { my ($broadcasted, $y, $ivs, $opt) = @_; ($y, $ivs) = _ols_prep_inputs(@_); _ols_main($broadcasted, $y, $ivs, $opt); } sub _ols_prep_inputs { # y [n], ivs [n x attr] pdl my ($broadcasted, $y, $ivs, $opt) = @_; my %opt = ( CONST => 1, PLOT => 0, WIN => undef, # for plotting ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } if (!$broadcasted) { $y = $y->squeeze; $y->getndims > 1 and croak "use ols_t for broadcasted version"; } $ivs = $ivs->dummy(1) if $ivs->getndims == 1; ($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$broadcasted; # set up ivs and const as ivs $opt{CONST} and $ivs = ones($ivs->dim(0))->glue( 1, $ivs ); ($y, $ivs); } sub _ols_main { my ($broadcasted, $y, $ivs, $opt) = @_; my %opt = ( CONST => 1, PLOT => 0, WIN => undef, # for plotting ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $C = inv( $ivs x $ivs->t ); # Internally normalise data # (double) it or ushort y and sequence iv won't work right my $ymean = $y->abs->avgover->double; $ymean->where( $ymean==0 ) .= 1; my $divisor = $broadcasted ? $ymean->dummy(0) : $ymean; my $y2 = $y / $divisor; my $Y = $ivs x $y2->dummy(0); # Do the fit # Fitted coefficients vector my $coeff = PDL::squeeze( $C x $Y ); $coeff = $coeff->dummy(0) if $broadcasted and $coeff->getndims == 1 and $y->getndims > 1; $coeff *= $divisor; # Un-normalise # ***$coeff x $ivs looks nice but produces nan on successive tries*** my $y_pred = sumover( ($broadcasted ? $coeff->dummy(1) : $coeff) * $ivs->transpose ); $opt{PLOT} and $y->plot_residuals( $y_pred, \%opt ); return $coeff unless wantarray; my %ret = (y_pred => $y_pred); $ret{ss_total} = $opt{CONST} ? $y->ss : sumover( $y ** 2 ); $ret{ss_residual} = $y->sse( $ret{y_pred} ); $ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; $ret{R2} = $ret{ss_model} / $ret{ss_total}; my $n_var = $opt{CONST} ? 
$ivs->dim(1) - 1 : $ivs->dim(1); $ret{F_df} = pdl( $n_var, my $df1 = $y->dim(0) - $ivs->dim(1) ); $ret{F} = $ret{ss_model} / $n_var / ($ret{ss_residual} / $df1); $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $n_var, $df1 ) if $CDF; if (!$broadcasted) { my $se_b = ones( $coeff->dims? $coeff->dims : 1 ); $opt{CONST} and $se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 * $C->slice(0,0) ); # get the se for bs by successively regressing each iv by the rest ivs if ($ivs->dim(1) > 1) { my @coords = $opt{CONST} ? 1..$n_var : 0..$n_var-1; my $ones = !$opt{CONST} ? undef : ones($ivs->dim(0)); for my $k (@coords) { my $G = $ivs->dice_axis(1, [grep $_ != $k, @coords]); $G = $ones->glue( 1, $G ) if $opt{CONST}; my $b_G = $ivs->slice(':',$k)->ols( $G, {CONST=>0,PLOT=>0} ); my $ss_res_k = $ivs->slice(':',$k)->squeeze->sse( sumover($b_G * $G->transpose) ); $se_b->slice($k) .= sqrt( $ret{ss_residual} / $df1 / $ss_res_k ); } } else { $se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 / sum( $ivs->slice(':',0)**2 ) ); } $ret{b_se} = $se_b; $ret{b_t} = $coeff / $ret{b_se}; $ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $df1 ) ) if $CDF; } for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze }; $ret{b} = $coeff; return %ret; } =head2 r2_change =for ref Significance test for the incremental change in R2 when new variable(s) are added to an ols regression model. Returns the change stats as well as stats for both models. Based on L. (One way to make up for the lack of significance tests for coeffs in ols_t). 
=for options Default options (case insensitive): CONST => 1, =for usage Usage: # suppose these are two persons' ratings for top 10 box office movies # ascending sorted by box office pdl> p $y = qsort ceil(random(10, 2) * 5) [ [1 1 2 2 2 3 4 4 4 4] [1 2 2 3 3 3 4 4 5 5] ] # first IV is a simple linear trend pdl> p $x1 = sequence 10 [0 1 2 3 4 5 6 7 8 9] # the modeler wonders if adding a quadratic trend improves the fit pdl> p $x2 = sequence(10) ** 2 [0 1 4 9 16 25 36 49 64 81] # two difference models are given in two pdls # each as would be pass on to ols_t # the 1st model includes only linear trend # the 2nd model includes linear and quadratic trends # when necessary use dummy dim so both models have the same ndims pdl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) ) pdl> p "$_\t$c{$_}\n" for sort keys %c # person 1 person 2 F_change [0.72164948 0.071283096] # df same for both persons F_df [1 7] F_p [0.42370145 0.79717232] R2_change [0.0085966043 0.00048562549] model0 HASH(0x8c10828) model1 HASH(0x8c135c8) # the answer here is no. =cut *r2_change = \&PDL::r2_change; sub PDL::r2_change { my ($self, $ivs0, $ivs1, $opt) = @_; $ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1); my %ret; $ret{model0} = { $self->ols_t( $ivs0, $opt ) }; $ret{model1} = { $self->ols_t( $ivs1, $opt ) }; $ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2}; $ret{F_df} = pdl(my $df0 = $ivs1->dim(1) - $ivs0->dim(1), my $df1 = $ret{model1}->{F_df}->slice('(1)') ); $ret{F_change} = $ret{R2_change} * $df1 / ( (1-$ret{model1}->{R2}) * $df0 ); $ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $df0, $df1 ) if $CDF; for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze }; %ret; } =head1 METHODS =head2 anova =for ref Analysis of variance. Uses type III sum of squares for unbalanced data. Dependent variable should be a 1D pdl. Independent variables can be passed as 1D perl array ref or 1D pdl. 
Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. Supports bad value (by ignoring missing or BAD values in dependent and independent variables list-wise). For more on ANOVA, see L. =for options Default options (case insensitive): V => 1, # carps if bad value in variables IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] PLOT => 0, # plots highest order effect # can set plot_means options here WIN => undef, # for plotting =for usage Usage: # suppose this is ratings for 12 apples pdl> p $y = qsort ceil( random(12)*5 ) [1 1 2 2 2 3 3 4 4 4 5 5] # IV for types of apple pdl> p $a = sequence(12) % 3 + 1 [1 2 3 1 2 3 1 2 3 1 2 3] # IV for whether we baked the apple pdl> @b = qw( y y y y y y n n n n n n ) pdl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } ) pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m F 2.46666666666667 F_df [5 6] F_p 0.151168719948632 ms_model 3.08333333333333 ms_residual 1.25 ss_model 15.4166666666667 ss_residual 7.5 ss_total 22.9166666666667 | apple | F 0.466666666666667 | apple | F_p 0.648078345471096 | apple | df 2 | apple | m [2.75 3 3.5] | apple | ms 0.583333333333334 | apple | se [0.85391256 0.81649658 0.64549722] | apple | ss 1.16666666666667 | apple || err df 6 | apple ~ bake | F 0.0666666666666671 | apple ~ bake | F_p 0.936190104380701 | apple ~ bake | df 2 | apple ~ bake | m [ [1.5 2 2.5] [ 4 4 4.5] ] | apple ~ bake | ms 0.0833333333333339 | apple ~ bake | se [ [0.5 1 0.5] [ 1 1 0.5] ] | apple ~ bake | ss 0.166666666666668 | apple ~ bake || err df 6 | bake | F 11.2666666666667 | bake | F_p 0.015294126084452 | bake | df 1 | bake | m [2 4.1666667] | bake | ms 14.0833333333333 | bake | se [0.36514837 0.40138649] | bake | ss 14.0833333333333 | bake || err df 6 This is implemented as a call to L, with an C subjects vector. 
=cut *anova = \&PDL::anova; sub PDL::anova { my ($y, @args) = @_; anova_rptd($y, undef, @args); } sub _interactions { my ($ivs_ref, $idv) = @_; my (@inter, @idv_inter); for my $nway ( 2 .. @$ivs_ref ) { my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] ); while ( my @v = &$iter_idv() ) { push @inter, interaction_code(@$ivs_ref[@v]); push @idv_inter, join ' ~ ', @$idv[@v]; } } (\@inter, \@idv_inter); } # now prepare for cell mean sub _interactions_cm { my ($ivs_ref, $pdl_ivs_raw) = @_; my ($dim0, @inter_cm, @inter_cmo) = $ivs_ref->[0]->dim(0); for my $nway ( 2 .. @$ivs_ref ) { my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] ); while ( my @v = &$iter_idv() ) { my @i_cm; for my $o ( 0 .. $dim0 - 1 ) { push @i_cm, join '', map $_->slice("($o)"), @$pdl_ivs_raw[@v]; } my ($inter, $map) = effect_code( \@i_cm ); push @inter_cm, $inter; # get the order to put means in correct multi dim pdl pos # this is order in var_e dim(1) my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map; # this is order needed for cell mean my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) } 0 .. 
$#levels; push @inter_cmo, pdl @i_cmo; } } (\@inter_cmo, \@inter_cm); } sub _cell_means { my ($data, $ivs_cm_ref, $i_cmo_ref, $idv, $pdl_ivs_raw) = @_; my %ind_id; @ind_id{ @$idv } = 0..$#$idv; my %cm; my $i = 0; for (@$ivs_cm_ref) { confess "_cell_means passed empty ivs_cm_ref ndarray at pos $i" if $_->isempty; my $last = zeroes $_->dim(0); my $i_neg = which $_->slice(':',0) == -1; $last->slice($i_neg) .= 1; $_->where($_ == -1) .= 0; $_ = $_->glue(1, $last); my @v = split ' ~ ', $idv->[$i]; my @shape = map $pdl_ivs_raw->[$_]->uniq->nelem, @ind_id{@v}; my ($m, $ss) = $data->centroid( $_ ); $m = $m->slice($i_cmo_ref->[$i])->sever; $ss = $ss->slice($i_cmo_ref->[$i])->sever; $m = $m->reshape(@shape); my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape); $cm{ "| $idv->[$i] | m" } = $m; $cm{ "| $idv->[$i] | se" } = $se; $i++; } \%cm; } # http://www.perlmonks.org/?node_id=371228 sub _combinations { my ($num, $arr) = @_; return sub { return } if $num == 0 or $num > @$arr; my @pick; return sub { return @$arr[ @pick = ( 0 .. $num - 1 ) ] unless @pick; my $i = $#pick; $i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i; return if $i < 0; @pick[$i .. $#pick] = $pick[$i] .. $#$arr; return @$arr[@pick]; }; } =head2 anova_rptd =for ref Repeated measures and mixed model anova. Uses type III sum of squares. The standard error (se) for the means are based on the relevant mean squared error from the anova, ie it is pooled across levels of the effect. Will only calculate p-value (C) if there are more samples than the product of categories of all the IVs. Uses L, so supports bad values. For more on repeated measures ANOVA, see L, and for mixed models, see L. =for options Default options (case insensitive): V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... 
] BTWN => [], # indices of between-subject IVs (matches IVNM indices) PLOT => 0, # plots highest order effect # see plot_means() for more options WIN => undef, # for plotting =for usage Usage: Some fictional data: recall_w_beer_and_wings.txt Subject Beer Wings Recall Alex 1 1 8 Alex 1 2 9 Alex 1 3 12 Alex 2 1 7 Alex 2 2 9 Alex 2 3 12 Brian 1 1 12 Brian 1 2 13 Brian 1 3 14 Brian 2 1 9 Brian 2 2 8 Brian 2 3 14 ... # rtable allows text only in 1st row and col my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt'; my ($b, $w, $dv) = $data->dog; # subj and IVs can be 1d pdl or @ ref # subj must be the first argument my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} ); print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m ss_residual 19.0833333333333 ss_subject 24.8333333333333 ss_total 133.833333333333 | Beer | F 9.39130434782609 | Beer | F_p 0.0547977008378944 | Beer | df 1 | Beer | m [10.916667 8.9166667] | Beer | ms 24 | Beer | se [0.4614791 0.4614791] | Beer | ss 24 | Beer || err df 3 | Beer || err ms 2.55555555555556 | Beer || err ss 7.66666666666667 | Beer ~ Wings | F 0.510917030567687 | Beer ~ Wings | F_p 0.623881438624431 | Beer ~ Wings | df 2 | Beer ~ Wings | m [ [ 10 7] [ 10.5 9.25] [12.25 10.5] ] | Beer ~ Wings | ms 1.625 | Beer ~ Wings | se [ [0.89170561 0.89170561] [0.89170561 0.89170561] [0.89170561 0.89170561] ] | Beer ~ Wings | ss 3.25000000000001 | Beer ~ Wings || err df 6 | Beer ~ Wings || err ms 3.18055555555555 | Beer ~ Wings || err ss 19.0833333333333 | Wings | F 4.52851711026616 | Wings | F_p 0.0632754786153548 | Wings | df 2 | Wings | m [8.5 9.875 11.375] | Wings | ms 16.5416666666667 | Wings | se [0.67571978 0.67571978 0.67571978] | Wings | ss 33.0833333333333 | Wings || err df 6 | Wings || err ms 3.65277777777778 | Wings || err ss 21.9166666666667 For mixed model anova, ie when there are between-subject IVs involved, feed the IVs as above, but specify in BTWN which IVs are between-subject. 
For example, if we had added age as a between-subject IV in the above example, we would do my %m = $dv->anova_rptd( $subj, $age, $b, $w, { ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] }); =cut *anova_rptd = \&PDL::anova_rptd; sub PDL::anova_rptd { my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef; my ($y, $subj, @ivs_raw) = @_; confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1; croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?" if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem); for (@ivs_raw) { croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1 } my %opt = ( V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] ( !defined($subj) ? () : ( BTWN => [], # indices of between-subject IVs (matches IVNM indices) )), PLOT => 0, # plots highest order effect WIN => undef, # for plotting ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ] if !$opt{IVNM} or !@{ $opt{IVNM} }; (my ($dsgn, $idv), $y, my ($sj, $ivs_ref_filtrd, $pdl_ivs_raw, $ivs_ref, $err_ref)) = $y->anova_design_matrix($subj, @ivs_raw, \%opt); confess "anova_rptd: got singular matrix for X' x X" if any +($dsgn->t x $dsgn)->det == 0; my $b_full = _ols_main(1, $y, $dsgn->t, {CONST=>0}); my %ret = (ss_total => $y->ss, ss_residual => $y->sse( sumover( $b_full * $dsgn ) )); if (defined $subj) { my @full = (@$ivs_ref, @$err_ref); my $has_btwn = @{ $opt{BTWN} }; my @is_btwn; $is_btwn[$_] = 1 for @{ $opt{BTWN} }; my @within_inds = 0 .. $#ivs_raw; @within_inds = grep !$is_btwn[$_], @within_inds if $has_btwn; my $within_df = pdl(map $_->dim(1), @full[@within_inds])->prodover->sclr; EFFECT: for my $k (0 .. $#full) { my $e = ($k > $#$ivs_ref)? '| err' : ''; my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k; my $i_pref = $k == $#full ? 
undef : "| $idv->[$i] |"; if (!defined $full[$k]) { # ss_residual as error $ret{ "$i_pref$e ss" } = $ret{ss_residual}; # highest ord inter for purely within design, (p-1)*(q-1)*(n-1) my $factor = (ref $full[-1] ? $full[-1] : $err_ref->[$full[-1]])->dim(1); my $df = $ret{ "$i_pref$e df" } = $factor * $within_df; die "${i_pref}residual df = 0" if $df <= 0; $ret{ "$i_pref$e ms" } = $ret{ "$i_pref$e ss" } / $df; } elsif (ref $full[$k]) { # unique error term next EFFECT unless my @G = grep $_ != $k && defined $full[$_], 0 .. $#full; my $G = ones($y->dim(0))->glue(1, grep ref $_, @full[@G]); my $b_G = $y->ols_t( $G, {CONST=>0} ); my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"} = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual}; if ($k != $#full) { my $df = $ret{"$i_pref$e df"} = $full[$k]->dim(1); die "residual df = 0" if $df <= 0; $ret{"$i_pref$e ms"} = $ss / $df; } } else { # repeating error term my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"} = $ret{"| $idv->[$full[$k]] |$e ss"}; if ($k != $#full) { my $df = $ret{"$i_pref$e df"} = $ret{"| $idv->[$full[$k]] |$e df"}; die "residual df = 0" if $df <= 0; $ret{"$i_pref$e ms"} = $ss / $df; } } } } else { $ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; $ret{F_df} = pdl(my $F_df0 = $dsgn->dim(0) - 1, my $df1 = $y->nelem - $dsgn->dim(0)); $ret{ms_model} = $ret{ss_model} / $F_df0; $ret{ms_residual} = $ret{ss_residual} / $df1; $ret{F} = $ret{ms_model} / $ret{ms_residual}; $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $F_df0, $df1 ) if $CDF and $df1 > 0; # get IV ss from $ivs_ref instead of $dsgn pdl my $ones = ones($y->dim(0)); for my $k (0 .. $#$ivs_ref) { my $G = $ones->glue(1, @$ivs_ref[grep $_ != $k, 0 .. 
$#$ivs_ref]); my $b_G = $y->ols_t( $G, {CONST=>0} ); $ret{ "| $idv->[$k] | ss" } = $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual}; my $df0 = $ret{ "| $idv->[$k] | df" } = $ivs_ref->[$k]->dim(1); $ret{ "| $idv->[$k] || err df" } = $df1; die "residual df = 0" if $df1 <= 0; $ret{ "| $idv->[$k] | ms" } = $ret{ "| $idv->[$k] | ss" } / $df0; } } # have all iv, inter, and error effects. get F and F_p for (0 .. $#$ivs_ref) { my $ms_residual = defined $subj ? $ret{ "| $idv->[$_] || err ms" } : $ret{ms_residual}; my ($df0, $df1) = @ret{"| $idv->[$_] | df" , "| $idv->[$_] || err df"}; my $F = $ret{ "| $idv->[$_] | F" } = $ret{ "| $idv->[$_] | ms" } / $ms_residual; $ret{ "| $idv->[$_] | F_p" } = 1 - $F->gsl_cdf_fdist_P($df0, $df1) if $CDF and $df1 > 0; } for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze}; my ($inter_cmo_ref, $inter_cm_ref) = _interactions_cm($ivs_ref_filtrd, $pdl_ivs_raw); # append inter info to cell means effects my $ivs_cm_ref = [@$ivs_ref_filtrd, @$inter_cm_ref]; my @i_cmo_ref = map pdl(values %{ (effect_code($_))[1] })->qsort, @$pdl_ivs_raw; push @i_cmo_ref, @$inter_cmo_ref; my $cm_ref = _cell_means( $y, $ivs_cm_ref, \@i_cmo_ref, $idv, $pdl_ivs_raw ); if (defined $subj) { my @ls = map { $_->uniq->nelem } @$pdl_ivs_raw; $cm_ref = _fix_rptd_se( $cm_ref, \%ret, $opt{IVNM}, \@ls, $sj->uniq->nelem ); } # integrate mean and se into %ret @ret{ keys %$cm_ref } = values %$cm_ref; my $highest = join(' ~ ', @{ $opt{IVNM} }); $cm_ref->{"| $highest | m"}->plot_means( $cm_ref->{"| $highest | se"}, { %opt, IVNM=>$idv } ) if $opt{PLOT}; %ret; } =head2 anova_design_matrix =for ref Effect-coded design matrix for anova, including repeated-measures and mixed-model. The C for use in linear regression i.e. C. Added in 0.854. See L for more. Supports bad value in the dependent and independent variables. It automatically removes bad data listwise, i.e. remove a subject's data if there is any cell missing for the subject. 
=for options Default options (case insensitive): V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] BTWN => [], # indices of between-subject IVs (matches IVNM indices) =for usage $matrix = $dv->anova_design_matrix(undef, $b, $w, {ivnm=>[qw(b w)]}); $matrix = $dv->anova_design_matrix( $subj, $b, $w, {ivnm=>[qw(b w)]}); # repeated-measures $matrix = $dv->anova_design_matrix( $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # mixed-model ($matrix, $ivnm_ref) = $dv->anova_design_matrix( $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # list context also names =cut *anova_design_matrix = \&PDL::anova_design_matrix; sub PDL::anova_design_matrix { my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef; my ($y, $subj, @ivs_raw) = @_; confess "No IVs: did you omit 'undef' for anova?" if !@ivs_raw; confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1; for (@ivs_raw) { croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1; } my %opt = ( V => 1, # carps if bad value in dv IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] ( !defined($subj) ? () : ( BTWN => [], # indices of between-subject IVs (matches IVNM indices) )), ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ] if !$opt{IVNM} or !@{ $opt{IVNM} }; my @idv_orig = @{ $opt{IVNM} }; my @pdl_ivs_raw = map scalar PDL::Stats::Basic::code_ivs($_), @ivs_raw; my $pdl_ivs = pdl(\@pdl_ivs_raw); # explicit set badflag because pdl() removes badflag $pdl_ivs->badflag( scalar grep $_->badflag, @pdl_ivs_raw ); my $sj; if (defined($subj)) { # delete bad data listwise ie remove subj if any cell missing $sj = PDL::Stats::Basic::code_ivs($subj); my $ibad = which( $y->isbad | nbadover($pdl_ivs->transpose) ); my $sj_bad = $sj->slice($ibad)->uniq; if ($sj_bad->nelem) { warn $sj_bad->nelem . 
" subjects with missing data removed\n" if $opt{V}; $sj = $sj->setvaltobad($_) for (list $sj_bad); my $igood = which $sj->isgood; for ($y, $sj, @pdl_ivs_raw) { $_ = $_->slice( $igood )->sever; $_->badflag(0); } } } else { ($y, $pdl_ivs) = _rm_bad_value( $y, $pdl_ivs ); if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) { warn sprintf "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem; } @pdl_ivs_raw = $pdl_ivs->dog; } my @ivs_ref_fltrd = map scalar effect_code($_), @pdl_ivs_raw; my ($ivs_inter_ref, $idv_inter) = _interactions(\@ivs_ref_fltrd, \@idv_orig); # append inter info to main effects my $ivs_ref = [@ivs_ref_fltrd, @$ivs_inter_ref]; my @idv = (@idv_orig, @$idv_inter); # matches $ivs_ref, with an extra last pdl for subj effect my $err_ref = !defined($subj) ? [] : _add_errors( $sj, $ivs_ref, \@idv, \@pdl_ivs_raw, $opt{BTWN} ); for (grep ref $err_ref->[$_], 0..$#$err_ref) { my ($null_row_ids, $non_null_row_ids) = $err_ref->[$_]->zcover->which_both; confess "got null columns $null_row_ids in error entry #$_ ($idv[$_])" if !$null_row_ids->isempty; } my $dsgn = PDL::glue(1, ones($y->dim(0)), @$ivs_ref, (grep ref($_), @$err_ref))->t; !wantarray ? $dsgn : ($dsgn, \@idv, $y, $sj, \@ivs_ref_fltrd, \@pdl_ivs_raw, $ivs_ref, $err_ref); } # code (btwn group) subjects. Rutherford (2011) pp 208-209 sub _code_btwn { my ($subj, $btwn) = @_; my (@grp, %grp_s); for my $n (0 .. $subj->nelem - 1) { # construct string to code group membership # something not treated as BAD by code_ivs to start off marking group membership # if no $btwn, everyone ends up in the same grp my $s = '_' . join '', map $_->slice($n), @$btwn; push @grp, $s; # group membership $s .= $subj->slice($n); # keep track of total uniq subj $grp_s{$s} = 1; } my $grp = PDL::Stats::Kmeans::iv_cluster \@grp; my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1); my $d1 = 0; for my $g (0 .. 
$grp->dim(1)-1) { my $col_inds = which $grp->slice(':',$g); my $gsub = $subj->slice( $col_inds )->effect_code; my ($nobs, $nsub) = $gsub->dims; $spdl->slice($col_inds, [$d1,$d1+$nsub-1]) .= $gsub; $d1 += $nsub; } $spdl; } sub _add_errors { my ($subj, $ivs_ref, $idv, $raw_ivs, $btwn) = @_; my $spdl = _code_btwn($subj, [@$raw_ivs[@$btwn]]); # if btwn factor involved, or highest order inter for within factors # elem is undef, so that # @errors ind matches @$ivs_ref, with an extra elem at the end for subj # mark btwn factors for error terms # same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p205) my %is_btwn = map +($_=>1), @$idv[ @$btwn ]; my $has_btwn = keys %is_btwn; my %idv2indx = map +($idv->[$_]=>$_), 0..$#$idv; my $ie_subj; my @errors = map { my @fs = split ' ~ ', $idv->[$_]; # separate bw and wn factors # if only bw, error is bw x subj # if only wn or wn and bw, error is wn x subj my @bw = !$has_btwn ? () : grep $is_btwn{$_}, @fs; my @wn = !$has_btwn ? @fs : grep !$is_btwn{$_}, @fs; $ie_subj = $_ if !defined($ie_subj) and !@wn; my $err = join ' ~ ', @wn ? @wn : @bw; # highest order inter of within factors, use ss_residual as error if ( @wn == @$raw_ivs - @$btwn ) { undef } # repeating btwn factors use ss_subject as error elsif (!@wn and $_ > $ie_subj) { $ie_subj } # repeating error term elsif ($_ > $idv2indx{$err}) { $idv2indx{$err} } elsif (@wn) { interaction_code($ivs_ref->[$_], $spdl) } else { $spdl } } 0 .. $#$ivs_ref; push @errors, $has_btwn ? $ie_subj : $spdl; \@errors; } sub _fix_rptd_se { # if ivnm lvls_ref for within ss only this can work for mixed design my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_; my @se = map /^\| (.+?) \| se$/ ? $1 : (), keys %$cm_ref; my @n_obs = map { my @ivs = split / ~ /, $_; my $i_ivs = which_id $ivnm, \@ivs; my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs; my $collapsed = $icollapsed->nelem? pdl( @$lvls_ref[(list $icollapsed)] )->prodover : 1 ; $n * $collapsed; } @se; for my $i (0 .. 
$#se) { $cm_ref->{"| $se[$i] | se"} .= sqrt( $ret->{"| $se[$i] || err ms"} / $n_obs[$i] ); } $cm_ref; } =head2 dummy_code =for ref Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression. Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). =for usage pdl> @a = qw(a a a b b b c c c) pdl> p $a = dummy_code(\@a) [ [1 1 1 0 0 0 0 0 0] [0 0 0 1 1 1 0 0 0] ] =cut *dummy_code = \&PDL::dummy_code; sub PDL::dummy_code { my ($var_ref) = @_; my $var_e = effect_code( $var_ref ); $var_e->where( $var_e == -1 ) .= 0; $var_e; } =head2 effect_code =for ref Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for use in regression. returns in @ context coded pdl and % ref to level - pdl->dim(1) index. Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). =for usage my @var = qw( a a a b b b c c c ); my ($var_e, $map) = effect_code( \@var ); print $var_e . $var_e->info . "\n"; [ [ 1 1 1 0 0 0 -1 -1 -1] [ 0 0 0 1 1 1 -1 -1 -1] ] PDL: Double D [9,2] print "$_\t$map->{$_}\n" for sort keys %$map a 0 b 1 c 2 =cut *effect_code = \&PDL::effect_code; sub PDL::effect_code { my ($var_ref) = @_; my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref ); my $var_max = $var->max; confess "effect_code called with only one unique value" if $var_max < 1; my $var_e = yvals( float, $var->nelem, $var_max ) == $var; $var_e->slice(which( $var == $var_max ), ) .= -1; $var_e = $var_e->setbadif( $var->isbad ) if $var->badflag; wantarray ? ($var_e, $map_ref) : $var_e; } =head2 effect_code_w =for ref Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index. Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). 
=for usage pdl> @a = qw( a a b b b c c ) pdl> p $a = effect_code_w(\@a) [ [ 1 1 0 0 0 -1 -1] [ 0 0 1 1 1 -1.5 -1.5] ] =cut *effect_code_w = \&PDL::effect_code_w; sub PDL::effect_code_w { my ($var_ref) = @_; my ($var_e, $map_ref) = effect_code( $var_ref ); return wantarray ? ($var_e, $map_ref) : $var_e if $var_e->sum == 0; my $pos = $var_e == 1; my $neg = $var_e == -1; my $w = $pos->sumover / $neg->sumover; my $neg_ind = $neg->whichND; $var_e->indexND($neg_ind) *= $w->slice($neg_ind->slice('(1)')); wantarray ? ($var_e, $map_ref) : $var_e; } =head2 interaction_code Returns the coded interaction term for effect-coded variables. Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). =for usage pdl> $a = sequence(6) > 2 pdl> p $a = $a->effect_code [ [ 1 1 1 -1 -1 -1] ] pdl> $b = pdl( qw( 0 1 2 0 1 2 ) ) pdl> p $b = $b->effect_code [ [ 1 0 -1 1 0 -1] [ 0 1 -1 0 1 -1] ] pdl> p $ab = interaction_code( $a, $b ) [ [ 1 0 -1 -1 -0 1] [ 0 1 -1 -0 -1 1] ] =cut *interaction_code = \&PDL::interaction_code; sub PDL::interaction_code { my $i = ones( $_[0]->dim(0), 1 ); $i = ($i * $_->dummy(1))->clump(1,2) for @_; $i; } =head2 ols =for ref Ordinary least squares regression, aka linear regression. Unlike B, ols is not broadcastable, but it can handle bad value (by ignoring observations with bad value in dependent or independent variables list-wise) and returns the full model in list context with various stats. IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply the constant vector in $x. Intercept is automatically added and returned as FIRST of the coeffs if CONST=>1. Returns full model in list context and coeff in scalar context. For more on multiple linear regression see L. 
=for options Default options (case insensitive): CONST => 1, PLOT => 0, # see plot_residuals() for plot options WIN => undef, # for plotting =for usage Usage: # suppose this is a person's ratings for top 10 box office movies # ascending sorted by box office pdl> $y = pdl '[1 1 2 2 2 2 4 4 5 5]' # construct IV with linear and quadratic component pdl> p $x = cat sequence(10), sequence(10)**2 [ [ 0 1 2 3 4 5 6 7 8 9] [ 0 1 4 9 16 25 36 49 64 81] ] pdl> %m = $y->ols( $x ) pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m F 40.4225352112676 F_df [2 7] F_p 0.000142834216344756 R2 0.920314253647587 # coeff constant linear quadratic b [0.981818 0.212121 0.030303] b_p [0.039910 0.328001 0.203034] b_se [0.389875 0.201746 0.021579] b_t [2.518284 1.051422 1.404218] ss_model 19.8787878787879 ss_residual 1.72121212121212 ss_total 21.6 y_pred [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545] =cut *ols = \&PDL::ols; sub PDL::ols { _ols_common(0, @_); } # ivs = [nobs x nivs] so can `dog` retval sub _rm_bad_value { my ($y, $ivs) = @_; return ($y, $ivs, undef) if !$y->check_badflag and !$ivs->check_badflag; my $idx = which($y->isgood & (nbadover ($ivs->transpose)==0)); $_ = $_->slice($idx)->sever for $y, $ivs; $_->badflag(0) for $y, $ivs; ($y, $ivs, $idx); } =head2 ols_rptd =for ref Repeated measures linear regression. Handles purely within-subject design for now. (Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006). See F for an example using the Lorch and Myers data. 
=for usage

Usage:

    # This is the example from Lorch and Myers (1990),
    # a study on how characteristics of sentences affected reading time
    # Three within-subject IVs:
    # SP -- serial position of sentence
    # WORDS -- number of words in sentence
    # NEW -- number of new arguments in sentence

    # $subj can be 1D pdl or @ ref and must be the first argument
    # IV can be 1D @ ref or pdl
    # 1D @ ref is effect coded internally into pdl
    # pdl is left as is
    my %r = $rt->ols_rptd( $subj, $sp, $words, $new );

    print "$_\t$r{$_}\n" for sort keys %r;

    ss_residual 58.3754646504336
    ss_subject  51.8590337714286
    ss_total    405.188241771429
    #        SP           WORDS        NEW
    F       [ 7.208473    61.354153    1.0243311]
    F_p     [0.025006181  2.619081e-05 0.33792837]
    coeff   [0.33337285   0.45858933   0.15162986]
    df      [1 1 1]
    df_err  [9 9 9]
    ms      [ 18.450705   73.813294    0.57026483]
    ms_err  [ 2.5595857   1.2030692    0.55671923]
    ss      [ 18.450705   73.813294    0.57026483]
    ss_err  [ 23.036272   10.827623    5.0104731]

=cut

*ols_rptd = \&PDL::ols_rptd;
sub PDL::ols_rptd {
  my ($y, $subj, @ivs_raw) = @_;
  $y = $y->squeeze;
  $y->getndims > 1 and croak "ols_rptd does not support broadcasting";
  # Normalize IVs: multi-dim pdls are used as-is, 1D pdls gain a dummy
  # column dim, and perl array refs are effect coded into pdls.
  my @ivs = map { (ref $_ eq 'PDL' and $_->ndims > 1)? $_
                : ref $_ eq 'PDL' ? $_->dummy(1)
                : scalar effect_code($_)
                ;
                } @ivs_raw;

  my %r;
  $r{ss_total} = $y->ss;

  # STEP 1: subj -- account for between-subject variance first
  my $s = effect_code $subj;
  my $b_s = $y->ols_t($s);
  my $pred = sumover($b_s->slice('1:-1') * $s->transpose) + $b_s->slice(0);
  $r{ss_subject} = $r{ss_total} - $y->sse( $pred );

  # STEP 2: add predictor variables
  my $iv_p = $s->glue(1, @ivs);
  my $b_p = $y->ols_t($iv_p);

  # only care about coeff for predictor vars. no subj or const coeff
  # NOTE(review): this slice takes one trailing coefficient per IV, which
  # is only correct when every IV contributes a single coded column; a
  # multi-column IV (factor with 3+ levels) would need dim(1) columns
  # here -- confirm before using with such factors.
  $r{coeff} = $b_p->slice([-@ivs,-1])->sever;

  # get total sse for this step
  $pred = sumover($b_p->slice('1:-1') * $iv_p->transpose) + $b_p->slice(0);
  my $ss_pe = $y->sse( $pred );

  # get predictor ss by successively reducing the model
  # (type III-style: ss of IV i = sse(model without i) - sse(full step-2 model))
  $r{ss} = zeroes scalar(@ivs);
  for my $i (0 .. $#ivs) {
    my $iv = $s->glue(1, @ivs[ grep $_ != $i, 0..$#ivs ]);
    my $b = $y->ols_t($iv);
    $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
    $r{ss}->slice($i) .= $y->sse($pred) - $ss_pe;
  }

  # STEP 3: get predictor x subj interaction as error term
  my $iv_e = PDL::glue 1, map interaction_code( $s, $_ ), @ivs;

  # get total sse for this step. full model now.
  my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) );
  $pred = sumover($b_f->slice('1:-1') * $iv_p->glue(1,$iv_e)->transpose)
        + $b_f->slice(0);
  $r{ss_residual} = $y->sse( $pred );

  # get predictor x subj ss by successively reducing the error term
  $r{ss_err} = zeroes scalar(@ivs);
  for my $i (0 .. $#ivs) {
    my $iv = $iv_p->glue(1,
      map interaction_code($s, $_), @ivs[grep $_ != $i, 0..$#ivs]);
    my $b = $y->ols_t($iv);
    my $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
    $r{ss_err}->slice($i) .= $y->sse($pred) - $r{ss_residual};
  }

  # Finally, get MS, F, etc
  # NOTE(review): df per IV is taken as the squeezed ndims, which equals
  # the coded column count only for 1- or 2-column IVs; an IV with 3+
  # coded columns would get df = 2. Presumably dim(1) was intended for
  # the multi-column case -- verify against the Lorch & Myers worked data.
  $r{df} = pdl( map $_->squeeze->ndims, @ivs );
  $r{ms} = $r{ss} / $r{df};
  # error df: (n_subjects - 1) columns in the subject coding x df per IV
  $r{df_err} = $s->dim(1) * $r{df};
  $r{ms_err} = $r{ss_err} / $r{df_err};
  $r{F} = $r{ms} / $r{ms_err};
  # p-values only available when PDL::GSL::CDF loaded at module-load time
  $r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} )
    if $CDF;

  %r;
}

=head2 logistic

=for ref

Logistic regression with maximum likelihood estimation using L.

IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
the constant vector in $x. It is included in the model and returned as
LAST of coeff. Returns full model in list context and coeff in scalar
context.

The significance tests are likelihood ratio tests (-2LL deviance) tests.
IV significance is tested by comparing deviances between the reduced
model (ie with the IV in question removed) and the full model.

***NOTE: the results here are qualitatively similar to but not identical
with results from R, because different algorithms are used for the
nonlinear parameter fit.
Use with discretion***

=for options

Default options (case insensitive):

    INITP => zeroes( $x->dim(1) + 1 ),    # n_iv + 1
    MAXIT => 1000,
    EPS   => 1e-7,

=for usage

Usage:

    # suppose this is whether a person had rented 10 movies
    pdl> p $y = ushort( random(10)*2 )
    [0 0 0 1 1 0 0 1 1 1]

    # IV 1 is box office ranking
    pdl> p $x1 = sequence(10)
    [0 1 2 3 4 5 6 7 8 9]

    # IV 2 is whether the movie is action- or chick-flick
    pdl> p $x2 = sequence(10) % 2
    [0 1 0 1 0 1 0 1 0 1]

    # concatenate the IVs together
    pdl> p $x = cat $x1, $x2
    [
     [0 1 2 3 4 5 6 7 8 9]
     [0 1 0 1 0 1 0 1 0 1]
    ]

    pdl> %m = $y->logistic( $x )
    pdl> p "$_\t$m{$_}\n" for sort keys %m

    D0      13.8629436111989
    Dm      9.8627829791575
    Dm_chisq        4.00016063204141
    Dm_df   2
    Dm_p    0.135324414081692

    #        ranking    genre      constant
    b       [0.41127706 0.53876358 -2.1201285]
    b_chisq [ 3.5974504 0.16835559  2.8577151]
    b_p     [0.057868258 0.6815774 0.090936587]

    iter    12
    y_pred  [0.10715577 0.23683909 ... 0.76316091 0.89284423]

    # to get the covariance out, supply a true value for the COV option:
    pdl> %m = $y->logistic( $x, {COV=>1} )
    pdl> p $m{cov};

=cut

*logistic = \&PDL::logistic;
sub PDL::logistic {
  require PDL::Fit::LM;
  my ( $self, $ivs, $opt ) = @_;
  $self = $self->squeeze;
  # make compatible w multiple var cases
  $ivs->getndims == 1 and $ivs = $ivs->dummy(1);
  # NOTE(review): this only warns on mismatched lengths and carries on --
  # presumably lmfit would then fail downstream; confirm whether croak
  # was intended.
  $self->dim(0) != $ivs->dim(0) and carp "mismatched n btwn DV and IV!";
  my %opt = (
    INITP => zeroes( $ivs->dim(1) + 1 ),    # n_ivs + 1
    MAXIT => 1000,
    EPS   => 1e-7,
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  # not using it atm
  $opt{WT} = 1;

  # Use lmfit. Fourth input argument is reference to user-defined
  # model function.
  # copy INITP so we have the original value when needed
  my ($yfit,$coeff,$cov,$iter)
    = PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic,
        $opt{INITP}->copy, { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );

  # apparently at least coeff is child of some pdl
  # which is changed in later lmfit calls
  $yfit  = $yfit->copy;
  $coeff = $coeff->copy;

  return $coeff unless wantarray;

  my %ret;

  # split sample into the two response classes for the null deviance
  my $n0 = $self->where($self == 0)->nelem;
  my $n1 = $self->nelem - $n0;

  $ret{cov} = $cov if $opt{COV};
  # D0: -2 log-likelihood of the intercept-only (null) model
  $ret{D0} = -2*($n0 * log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
  # Dm: deviance of the fitted model, from the deviance residuals
  $ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
  $ret{Dm_chisq} = $ret{D0} - $ret{Dm};
  $ret{Dm_df} = $ivs->dim(1);
  $ret{Dm_p}
    = 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
    if $CDF;

  # per-coefficient likelihood-ratio chi-squares: refit with each IV
  # removed and compare deviances against the full model
  my $coeff_chisq = zeroes $opt{INITP}->nelem;

  if ( $ivs->dim(1) > 1 ) {
    for my $k (0 .. $ivs->dim(1)-1) {
      # reduced model: all IV columns except column $k
      my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
      my $G = $ivs->dice_axis(1, \@G);
      # start params: matching slices of INITP plus the intercept slot
      my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
      my $y_G = PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
                  { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
      $coeff_chisq->slice($k) .= $self->dm( $y_G ) - $ret{Dm};
    }
  }
  else {
    # d0 is, by definition, the deviance with only intercept
    $coeff_chisq->slice(0) .= $ret{D0} - $ret{Dm};
  }

  # intercept chi-square: refit without the constant term
  my $y_c = PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT},
              \&_logistic_no_intercept,
              $opt{INITP}->slice('0:-2')->sever,
              { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
  $coeff_chisq->slice(-1) .= $self->dm( $y_c ) - $ret{Dm};

  $ret{b} = $coeff;
  $ret{b_chisq} = $coeff_chisq;
  # each LR test has 1 df
  $ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 ) if $CDF;
  $ret{y_pred} = $yfit;
  $ret{iter} = $iter;

  for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  %ret;
}

# Model function for lmfit: logistic curve with intercept.
# $x is the IV pdl, $par the parameter pdl (slopes then intercept);
# $ym receives the fitted values and $dyda the partial derivatives.
sub _logistic {
  my ($x,$par,$ym,$dyda) = @_;

  # $b and $c are fit parameters slope and intercept
  my $b = $par->slice([0,$x->dim(1) - 1])->sever;
  my $c = $par->slice(-1)->sever;

  # Write function with dependent variable $ym,
  # independent variable $x, and fit parameters as specified above.
  # Use the .= (dot equals) assignment operator to express the equality
  # (not just a plain equals)
  $ym .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );

  my @dy = map $dyda->slice(",($_)"), 0 .. $par->dim(0)-1;

  # Partial derivative of the function with respect to each slope
  # fit parameter ($b in this case). Again, note .= assignment
  # operator (not just "equals")
  $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
    for (0 .. $b->dim(0)-1);

  # Partial derivative of the function re intercept par
  $dy[-1] .= $ym * (1 - $ym);
}

# Model function for lmfit: logistic curve WITHOUT intercept, used to
# likelihood-ratio-test the constant term of the full model.
sub _logistic_no_intercept {
  my ($x,$par,$ym,$dyda) = @_;

  my $b = $par->slice([0,$x->dim(1) - 1])->sever;

  # Write function with dependent variable $ym,
  # independent variable $x, and fit parameters as specified above.
  # Use the .= (dot equals) assignment operator to express the equality
  # (not just a plain equals)
  $ym .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );

  my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);

  # Partial derivative of the function with respect to each slope
  # fit parameter ($b in this case). Again, note .= assignment
  # operator (not just "equals")
  $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
    for 0 .. $b->dim(0)-1;
}

=head2 pca

=for ref

Principal component analysis. Based on corr instead of cov. (Bad values
are ignored pair-wise. OK when bad values are few but otherwise probably
should fill_m etc before pca.) Uses L.

=for options

Default options (case insensitive):

    CORR => 1,       # boolean. use correlation or covariance
    PLOT => 0,       # calls plot_screes by default
                     # can set plot_screes options here
    WIN  => undef,   # for plotting

=for usage

Usage:

    my $d = qsort random 10, 5;    # 10 obs on 5 variables
    my %r = $d->pca( \%opt );
    print "$_\t$r{$_}\n" for (keys %r);

    eigenvalue    # variance accounted for by each component
    [4.70192 0.199604 0.0471421 0.0372981 0.0140346]

    eigenvector   # dim var x comp.
    weights for mapping variables to component
    [
     [ -0.451251  -0.440696  -0.457628  -0.451491  -0.434618]
     [ -0.274551   0.582455   0.131494   0.255261  -0.709168]
     [  0.43282    0.500662  -0.139209  -0.735144  -0.0467834]
     [  0.693634  -0.428171   0.125114   0.128145  -0.550879]
     [  0.229202   0.180393  -0.859217   0.4173     0.0503155]
    ]

    loadings      # dim var x comp. correlation between variable and component
    [
     [ -0.978489  -0.955601  -0.992316  -0.97901   -0.942421]
     [ -0.122661   0.260224   0.0587476  0.114043  -0.316836]
     [  0.0939749  0.108705  -0.0302253 -0.159616  -0.0101577]
     [  0.13396   -0.0826915  0.0241629  0.0247483 -0.10639]
     [  0.027153   0.0213708 -0.101789   0.0494365  0.00596076]
    ]

    pct_var       # percent variance accounted for by each component
    [0.940384 0.0399209 0.00942842 0.00745963 0.00280691]

Plot scores along the first two components,

    $d->plot_scores( $r{eigenvector} );

=cut

*pca = \&PDL::pca;
sub PDL::pca {
  my ($self, $opt) = @_;
  my %opt = (
    CORR => 1,
    PLOT => 0,
    WIN  => undef,     # for plotting
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }

  # var x var association matrix to be decomposed
  my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;

  # value is axis pdl and score is var x axis
  my ($eigvec, $eigval) = $var_var->eigens_sym;
  $eigvec = $eigvec->transpose;  # compatibility with PDL::Slatec::eigsys

  # sort components descending by eigenvalue
  # ind is sticky point for broadcasting
  my $ind_sorted = $eigval->qsorti->slice('-1:0');
  $eigvec = $eigvec->slice(':',$ind_sorted)->sever;
  $eigval = $eigval->slice($ind_sorted)->sever;

  # var x axis
  my $var = $eigval / $eigval->sum->sclr;

  my $loadings;
  if ($opt{CORR}) {
    # with standardized data the loading is eigenvector scaled by sqrt(eigenvalue)
    $loadings = $eigvec * sqrt( $eigval->transpose );
  }
  else {
    # covariance-based: correlate raw component scores with the data
    my $scores = $eigvec x $self->dev_m;
    $loadings = $self->corr( $scores->dummy(1) );
  }

  $var->plot_screes(\%opt) if $opt{PLOT};

  ( eigenvalue=>$eigval, eigenvector=>$eigvec,
    pct_var=>$var, loadings=>$loadings );
}

=head2 pca_sorti

Determine by which vars a component is best represented. Descending sort
vars by size of association with that component. Returns sorted var and
relevant component indices.
=for options

Default options (case insensitive):

    NCOMP => 10,     # maximum number of components to consider

=for usage

Usage:

    # let's see if we replicated the Osgood et al. (1957) study
    pdl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}

    # select a subset of var to do pca
    pdl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
    pdl> $data = $data( ,$ind)->sever
    pdl> @$idv = @$idv[list $ind]

    pdl> %m = $data->pca
    pdl> ($iv, $ic) = $m{loadings}->pca_sorti()
    pdl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)

    #       COMP0       COMP1       COMP2       COMP3
    HAPPY   [0.860191   0.364911    0.174372   -0.10484]
    GOOD    [0.848694   0.303652    0.198378   -0.115177]
    CALM    [0.821177  -0.130542    0.396215   -0.125368]
    BRIGHT  [0.78303    0.232808   -0.0534081  -0.0528796]
    HEAVY   [-0.623036  0.454826    0.50447     0.073007]
    HARD    [-0.679179  0.0505568   0.384467    0.165608]
    ACTIVE  [-0.161098  0.760778   -0.44893    -0.0888592]
    FAST    [-0.196042  0.71479    -0.471355    0.00460276]
    LARGE   [-0.241994  0.594644    0.634703   -0.00618055]
    BASS    [-0.621213 -0.124918    0.0605367  -0.765184]

=cut

*pca_sorti = \&PDL::pca_sorti;
sub PDL::pca_sorti {
  # $self is pdl (var x component)
  my ($self, $opt) = @_;
  my %opt = (
    NCOMP => 10,     # maximum number of components to consider
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }

  my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
  $self = $self->dice_axis( 1, pdl(0..$ncomp-1) );

  # for each var, the component it is most strongly associated with
  my $icomp = $self->transpose->abs->maximum_ind;

  # sort between comp
  my $ivar_sort = $icomp->qsorti;
  $self = $self->slice($ivar_sort)->sever;

  # sort within comp: vars belonging to the same component are ordered
  # descending by their loading on that component
  my $ic = $icomp->slice($ivar_sort)->iv_cluster;
  for my $comp (0 .. $ic->dim(1)-1) {
    my $i = $self->slice(which($ic->slice(':',$comp)), "($comp)")
                 ->qsorti->slice('-1:0');
    $ivar_sort->slice(which $ic->slice(':',$comp))
      .= $ivar_sort->slice(which $ic->slice(':',$comp))->slice($i);
  }
  wantarray ? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
}

=head2 plot_means

Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).
=for options

Default options (case insensitive):

    IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM => 'DV',
    AUTO => 1,       # auto set dims to be on x-axis, line, panel
                     # if set 0, dim 0 goes on x-axis, dim 1 as lines
                     # dim 2+ as panels
    # see PDL::Graphics::Simple for next option
    WIN  => undef,   # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE => 640,     # individual square panel size in pixels

=for usage

Usage:

    # see anova for mean / se pdl structure
    $mean->plot_means( $se, {IVNM=>['apple', 'bake']} );

Or like this:

    $m{'| apple ~ bake | m'}->plot_means;

=cut

*plot_means = \&PDL::plot_means;
sub PDL::plot_means {
  require PDL::Graphics::Simple;
  # trailing hash ref, if any, is the option hash
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($self, $se) = @_;
  $self = $self->squeeze;
  if ($self->ndims > 4) {
    carp "Data is > 4D. No plot here.";
    return;
  }
  my %opt = (
    IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
    DVNM => 'DV',
    AUTO => 1,       # auto set vars to be on X axis, line, panel
    WIN  => undef,   # PDL::Graphics::Simple object
    SIZE => 640,     # individual square panel size in pixels
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }

  # decide which vars to plot as x axis, lines, panels
  # put var w most levels on x axis
  # put var w least levels on diff panels
  my @iD = 0..3;
  my @dims = (1, 1, 1, 1);
  # pad the data out to 4 dims so the panel logic below is uniform
  # splice ARRAY,OFFSET,LENGTH,LIST
  splice @dims, 0, $self->ndims, $self->dims;
  $self = $self->reshape(@dims)->sever;
  $se = $se->reshape(@dims)->sever if defined $se;
  @iD = reverse list qsorti pdl @dims if $opt{AUTO};

  # $iD[0] on x axis
  # $iD[1] as separate lines
  my $nx = $self->dim($iD[2]);    # n xpanels
  my $ny = $self->dim($iD[3]);    # n ypanels

  my $w = $opt{WIN}
    || PDL::Graphics::Simple::pgswin( size=>[$opt{SIZE}*$nx, $opt{SIZE}*$ny,'px']);

  my $seq0 = sequence(my $dim0 = $self->dim($iD[0]));
  my ($pcount, @plots) = 0;
  for my $y (0..$ny-1) {
    for my $x (0..$nx-1) {
      # legend key names the panel coordinates when there is >1 panel
      my $key_prefix = "$opt{IVNM}[$iD[0]]|";
      $key_prefix .= $opt{IVNM}[$iD[2]] . "=$x|" if $nx > 1;
      $key_prefix .= $opt{IVNM}[$iD[3]] . "=$y|" if $ny > 1;
      for (0 .. $self->dim($iD[1]) - 1) {
        my $ke = "$key_prefix$opt{IVNM}[$iD[1]]=$_";
        my $ydiced = $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
                          ->dice_axis($iD[1],$_)->squeeze;
        # offset each line by 0.05 on x so overlapping points stay visible
        push @plots, with=>'lines', ke=>"$ke mean", style=>$pcount,
          $seq0+$pcount*0.05, $ydiced;
        push @plots, with=>'errorbars', ke=>"$ke error", style=>$pcount,
          $seq0+$pcount*0.05, $ydiced,
          $se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
             ->dice_axis($iD[1],$_)->squeeze
          if defined($se);
        $pcount++;
      }
    }
  }
  # y range must cover the error bars when $se was given
  my ($ymin, $ymax) = pdl($self, !defined $se ? () : ($self+$se, $self-$se))->minmax;
  $w->plot(@plots,
    { xlabel=>$opt{IVNM}[$iD[0]], ylabel=>$opt{DVNM},
      xrange=>[-0.05,$dim0-1+$pcount*0.05],
      yrange=>[$ymin-0.05,$ymax+0.05] } );
  $w;
}

=head2 plot_residuals

Plots residuals against predicted values.

=for usage

Usage:

    use PDL::Graphics::Simple;
    $w = pgswin();
    $y->plot_residuals( $y_pred, { win=>$w } );

=for options

Default options (case insensitive):

    # see PDL::Graphics::Simple for more info
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
    COLOR => 1,

=cut

*plot_residuals = \&PDL::plot_residuals;
sub PDL::plot_residuals {
  require PDL::Graphics::Simple;
  # trailing hash ref, if any, is the option hash
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, $y_pred) = @_;
  my %opt = (
    # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
    COLOR => 1,
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  my $residuals = $y - $y_pred;
  my $win = $opt{WIN}
    || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
  $win->plot(
    with=>'points', style=>$opt{COLOR}, $y_pred, $residuals,
    with=>'lines', style=>$opt{COLOR}, pdl($y_pred->minmax), pdl(0,0), # 0-line
    {xlabel=>'predicted value', ylabel=>'residuals'},
  );
}

=head2 plot_scores

Plots standardized original and PCA transformed scores against two
components.
(Thank you, Bob MacCallum, for the documentation suggestion that led to
this function.)

=for options

Default options (case insensitive):

    CORR  => 1,      # boolean. PCA was based on correlation or covariance
    COMP  => [0,1],  # indices to components to plot
    # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
    COLOR => [1,2],  # color for original and rotated scores

=for usage

Usage:

    my %p = $data->pca();
    $data->plot_scores( $p{eigenvector}, \%opt );

=cut

*plot_scores = \&PDL::plot_scores;
sub PDL::plot_scores {
  require PDL::Graphics::Simple;
  # trailing hash ref, if any, is the user option hash
  my $user_opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($self, $eigvec) = @_;
  my %opt = (
    CORR  => 1,      # boolean. PCA was based on correlation or covariance
    COMP  => [0,1],  # indices to components to plot
    # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
    COLOR => [1,2],  # color for original and transformed scores
  );
  if ($user_opt) {
    $opt{uc $_} = $user_opt->{$_} for keys %$user_opt;
  }

  my $comp_ind = pdl $opt{COMP};
  # normalize the data the same way the PCA did: standardize for a
  # correlation-based PCA, mean-center for a covariance-based one
  my $normed = $opt{CORR} ? $self->stddz : $self->dev_m;

  # project the normed data onto the chosen components
  my $rotated
    = sumover($eigvec->slice(':',$comp_ind) * $normed->transpose->dummy(1))
      ->transpose;
  $normed = $normed->slice(':',$comp_ind)->sever;

  my $w = $opt{WIN}
    || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
  $w->plot(
    with=>'points', style=>$opt{COLOR}[0], ke=>'original',
      $normed->slice(',(0)'), $normed->slice(',(1)'),
    with=>'points', style=>$opt{COLOR}[1], ke=>'transformed',
      $rotated->slice(',(0)'), $rotated->slice(',(1)'),
    {xlabel=>"Component $opt{COMP}[0]", ylabel=>"Component $opt{COMP}[1]"},
  );
}

=head2 plot_screes

Scree plot. Plots proportion of variance accounted for by PCA components.
=for options

Default options (case insensitive):

    NCOMP => 20,     # max number of components to plot
    CUT   => 0,      # set to plot cutoff line after this many components
                     # undef to plot suggested cutoff line for NCOMP comps
    # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels

=for usage

Usage:

    # variance should be in descending order

    $d = qsort random 10, 5;    # 10 obs on 5 variables
    %pca = $d->pca( \%opt );
    $pca{pct_var}->plot_screes( {ncomp=>16, win=>$win=PDL::Graphics::Simple::pgswin()} );

Or, because NCOMP is used so often, it is allowed a shortcut,

    $pca{pct_var}->plot_screes( 16 );

=cut

*plot_scree = \&PDL::plot_screes;    # here for now for compatibility
*plot_screes = \&PDL::plot_screes;
sub PDL::plot_screes {
  require PDL::Graphics::Simple;
  # trailing hash ref, if any, is the option hash
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  # second positional arg is the NCOMP shortcut documented above
  my ($self, $ncomp) = @_;
  my %opt = (
    NCOMP => 20,     # max number of components to plot
    CUT   => 0,      # set to plot cutoff line after this many components
                     # undef to plot suggested cutoff line for NCOMP comps
    # see PDL::Graphics::Simple for next options
    WIN   => undef,  # pgswin object. not closed here if passed
                     # allows comparing multiple lines in same plot
    SIZE  => 640,    # plot size in pixels
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{NCOMP} = $ncomp if $ncomp;
  # re-use $ncomp below
  $ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP};
  my $self_comp = $self->slice([0,$ncomp-1]);
  # CUT => undef asks for the automatically suggested elbow cutoff
  $opt{CUT} = PDL::Stats::Kmeans::_scree_ind $self_comp
    if !defined $opt{CUT};

  my $win = $opt{WIN}
    || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
  $win->plot(
    with=>'lines', ke=>'scree', sequence($ncomp), $self_comp,
    # vertical cutoff line, drawn halfway before component CUT
    !$opt{CUT} ? () : (with=>'lines', ke=>'cut',
      pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05)),
    {xlabel=>'Component', ylabel=>'Proportion of Variance Accounted for',
     xrange=>[-0.05,$ncomp-0.95], yrange=>[0,1], le=>'tr'},
  );
}

=head2 plot_stripchart

Stripchart plot. Plots ANOVA-style data, categorised against given IVs.

=for options

Default options (case insensitive):

    IVNM => [],      # auto filled as ['IV_0', 'IV_1', ... ]
    DVNM => 'DV',
    # see PDL::Graphics::Simple for next options
    WIN  => undef,   # pgswin object. not closed here if passed

=for usage

Usage:

    %m = $y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)] } );

=cut

# horizontal spacing unit between the per-level point columns
my $CHART_GAP = 0.1;
*plot_stripchart = \&PDL::plot_stripchart;
sub PDL::plot_stripchart {
  require PDL::Graphics::Simple;
  # trailing hash ref, if any, is the option hash
  my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
  my ($y, @ivs_raw) = @_;
  my %opt = (
    IVNM => [],      # auto filled as ['IV_0', 'IV_1', ... ]
    DVNM => 'DV',
    WIN  => undef,   # pgswin object. not closed here if passed
  );
  if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
    if !$opt{IVNM} or !@{ $opt{IVNM} };
  my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin();
  # code_ivs returns (coded pdl, level-name => code map) per IV
  my @codes = map [code_ivs($_)], @ivs_raw;
  # level names per IV, ordered by their numeric codes
  my @levels = map {
    my $map = $_->[1];
    [sort {$map->{$a} <=> $map->{$b}} keys %$map];
  } @codes;
  # small random x offsets so identical y values don't overprint
  my $xjitter = $y->random * $CHART_GAP;
  my ($pcount, @plots) = 0;
  # first column: the whole sample
  push @plots, with=>'points', ke=>"all data",
    $xjitter+$pcount, $y;
  $pcount++;
  for my $i (0..$#ivs_raw) {
    my $levs  = $levels[$i];
    my $name  = $opt{IVNM}[$i];
    my $coded = $codes[$i][0];
    # one jittered point column per level of this IV
    for my $j (0..$#$levs) {
      my $inds = which($coded == $j);
      push @plots, with=>'points', ke=>"$name=$levs->[$j]",
        $xjitter->slice($inds)+$pcount+$j*$CHART_GAP, $y->slice($inds);
    }
    $pcount++;
  }
  my ($ymin, $ymax) = $y->minmax;
  my $xmax = $pcount-1 + $CHART_GAP*($#{$levels[-1]} + 2);
  $w->plot(@plots,
    { xlabel=>'IV', ylabel=>$opt{DVNM},
      xrange=>[-1,$xmax],
      yrange=>[$ymin-$CHART_GAP,$ymax+$CHART_GAP] } );
  $w;
}

=head1 SEE ALSO

L

L

=head1 REFERENCES

Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied Multiple
Regression/correlation Analysis for the Behavioral Sciences (3rd ed.).
Mahwah, NJ: Lawrence Erlbaum Associates Publishers.

Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd
ed.). New York, NY: Wiley-Interscience.

Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated
measures data in cognitive research. Journal of Experimental Psychology:
Learning, Memory, & Cognition, 16, 149-157.

Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of
Meaning. Champaign, IL: University of Illinois Press.

Rutherford, A. (2011). ANOVA and ANCOVA: A GLM Approach (2nd ed.). Wiley.

Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved
April 10, 2011 from http://citeseerx.ist.psu.edu/

The GLM procedure: unbalanced ANOVA for two-way design with interaction.
(2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18, 2009 from
http://support.sas.com/

Van den Noortgate, W., & Onghena, P. (2006).
Analysing repeated measures data in cognitive research: A comment on regression coefficient analyses. European Journal of Cognitive Psychology, 18, 937-952. =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.855/lib/PDL/Stats/Basic.pd0000644000175000017500000006536114762213552017047 0ustar osboxesosboxesuse strict; use warnings; my $F = [qw(F D)]; pp_add_exported(qw(binomial_test rtable which_id code_ivs )); pp_addpm({At=>'Top'}, <<'EOD'); use strict; use warnings; use PDL::LiteF; use Carp; eval { require PDL::Core; require PDL::GSL::CDF; }; my $CDF = 1 if !$@; =head1 NAME PDL::Stats::Basic -- basic statistics and related utilities such as standard deviation, Pearson correlation, and t-tests. =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are NOT broadcastable, respectively. Does not have mean or median function here. see SEE ALSO. 
=head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::Basic; my $stdv = $data->stdv; or my $stdv = stdv( $data ); =cut EOD pp_addhdr(' #include ' ); pp_def('stdv', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); a2 += $a() * $a(); %} if (N < 1) { $SETBAD(b()); continue; } $GENERIC() var = a2 / N - (sa/N)*(sa/N); if (var < 0) var = 0; $b() = sqrt(var); ', Doc => 'Sample standard deviation.', ); pp_def('stdv_unbiased', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); a2 += $a() * $a(); %} if (N < 2) { $SETBAD(b()); continue; } $GENERIC() var = a2/(N-1) - sa*sa/(N*(N-1)); if (var < 0) var = 0; $b() = sqrt(var); ', Doc => 'Unbiased estimate of population standard deviation.', ); pp_def('var', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) a2 = 0, sa = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); a2 += $a() * $a(); %} if (N < 1) { $SETBAD(b()); continue; } $b() = a2 / N - sa*sa/(N*N); ', Doc => 'Sample variance.', ); pp_def('var_unbiased', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) a2 = 0, sa = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) a2 += $a() * $a(); sa += $a(); %} if (N < 2) { $SETBAD(b()); continue; } $b() = (a2 - sa*sa/N) / (N-1); ', Doc => 'Unbiased estimate of population variance.', ); pp_def('se', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); a2 += $a() * $a(); %} $GENERIC() se2 = (a2 - sa*sa/N) / (N*(N-1)); if 
(se2 < 0) se2 = 0; $b() = sqrt(se2); ', Doc => ' =for ref Standard error of the mean. Useful for calculating confidence intervals. =for example # 95% confidence interval for samples with large N $ci_95_upper = $data->average + 1.96 * $data->se; $ci_95_lower = $data->average - 1.96 * $data->se; ', ); pp_def('ss', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, a2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); a2 += $a() * $a(); %} if (N < 1) { $SETBAD(b()); continue; } $b() = a2 - sa*sa/N; ', Doc => 'Sum of squared deviations from the mean.', ); pp_def('skew', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d3 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); %} if (N < 1) { $SETBAD(b()); continue; } m = sa / N; loop (n) %{ if ( $ISGOOD($a()) ) { d = $a() - m; d2 += d*d; d3 += d*d*d; } %} $b() = d3/N / pow(d2/N, 1.5); ', Doc => 'Sample skewness, measure of asymmetry in data. skewness == 0 for normal distribution.', ); pp_def('skew_unbiased', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d3 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); %} if (N < 3) { $SETBAD(b()); continue; } m = sa / N; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) d = $a() - m; d2 += d*d; d3 += d*d*d; %} $b() = sqrt(N*(N-1)) / (N-2) * d3/N / pow(d2/N, 1.5); ', Doc => 'Unbiased estimate of population skewness. 
This is the number in GNumeric Descriptive Statistics.', ); pp_def('kurt', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d4 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); %} if (N < 1) { $SETBAD(b()); continue; } m = sa / N; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) d = $a() - m; d2 += d*d; d4 += d*d*d*d; %} $b() = N * d4 / (d2*d2) - 3; ', Doc => 'Sample kurtosis, measure of "peakedness" of data. kurtosis == 0 for normal distribution.', ); pp_def('kurt_unbiased', Pars => 'a(n); [o]b()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d4 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); %} if (N < 4) { $SETBAD(b()); continue; } m = sa / N; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) d = $a() - m; d2 += d*d; d4 += d*d*d*d; %} $b() = ((N-1)*N*(N+1) * d4 / (d2*d2) - 3 * (N-1)*(N-1)) / ((N-2)*(N-3)); ', Doc => 'Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics.', ); pp_def('cov', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) ab = 0, sa = 0, sb = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) continue; N++;,) ab += $a() * $b(); sa += $a(); sb += $b(); %} if (N < 1) { $SETBAD(c()); continue; } $c() = ab / N - (sa/N) * (sb/N); ', Doc => 'Sample covariance. 
see B for ways to call', ); pp_def('cov_table', Pars => 'a(n,m); [o]c(m,m)', HandleBad => 1, RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");', Code => ' $GENERIC(a) a_, b_; PDL_Indx M = $SIZE(m), i, j; for (i=0; ii)) || $ISBAD($a(m=>j))) continue; N++;,) sa += a_ = $a(m=>i); sb += b_ = $a(m=>j); ab += a_ * b_; %} if (N < 2) { $SETBAD($c(m0=>i, m1=>j)); $SETBAD($c(m0=>j, m1=>i)); continue; } $GENERIC(c) cov = ab - (sa * sb) / N; $c(m0=>i, m1=>j) = $c(m0=>j, m1=>i) = cov / N; } } ', Doc => ' =for ref Square covariance table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. =for example Usage: # 5 obs x 3 var, 2 such data tables pdl> $a = random 5, 3, 2 pdl> p $cov = $a->cov_table [ [ [ 8.9636438 -1.8624472 -1.2416588] [-1.8624472 14.341514 -1.4245366] [-1.2416588 -1.4245366 9.8690655] ] [ [ 10.32644 -0.31311789 -0.95643674] [-0.31311789 15.051779 -7.2759577] [-0.95643674 -7.2759577 5.4465141] ] ] # diagonal elements of the cov table are the variances pdl> p $a->var [ [ 8.9636438 14.341514 9.8690655] [ 10.32644 15.051779 5.4465141] ] for the same cov matrix table using B, pdl> p $a->dummy(2)->cov($a->dummy(1)) ', ); pp_def('corr', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");', Code => ' $GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb; ab=0; sa=0; sb=0; a2=0; b2=0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) continue; N++;,) ab += $a() * $b(); sa += $a(); sb += $b(); a2 += $a() * $a(); b2 += $b() * $b(); %} if (N < 2) { $SETBAD(c()); continue; } cov = ab - (sa * sb) / N; va = a2 - sa*sa / N; vb = b2 - sb*sb / N; $c() = cov / sqrt( va * vb ); ', Doc => ' =for ref Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)). 
=for example Usage: pdl> $a = random 5, 3 pdl> $b = sequence 5,3 pdl> p $a->corr($b) [0.20934208 0.30949881 0.26713007] for square corr table pdl> p $a->corr($a->dummy(1)) [ [ 1 -0.41995259 -0.029301192] [ -0.41995259 1 -0.61927619] [-0.029301192 -0.61927619 1] ] but it is easier and faster to use B. ', ); pp_def('corr_table', Pars => 'a(n,m); [o]c(m,m)', HandleBad => 1, RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");', Code => ' $GENERIC(a) a_, b_; $GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb, r; PDL_Indx M = $SIZE(m), i, j; for (i=0; ii)) || $ISBAD($a(m=>j))) continue; N++;,) sa += a_ = $a(m=>i); sb += b_ = $a(m=>j); ab += a_ * b_; a2 += a_ * a_; b2 += b_ * b_; %} if (N < 2) { $SETBAD($c(m0=>i, m1=>j)); $SETBAD($c(m0=>j, m1=>i)); continue; } cov = ab - (sa * sb) / N; va = a2 - sa*sa / N; vb = b2 - sb*sb / N; r = cov / sqrt( va * vb ); $c(m0=>i, m1=>j) = $c(m0=>j, m1=>i) = r; } PDL_IF_BAD(PDL_Indx N = 0; loop (n) %{ if ($ISGOOD($a(m=>i))) N ++; if (N > 1) break; %} if (N < 2) { $SETBAD($c(m0=>i, m1=>i)); continue; },) $c(m0=>i, m1=>i) = 1.0; } ', Doc => ' =for ref Square Pearson correlation table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls. =for example Usage: # 5 obs x 3 var, 2 such data tables pdl> $a = random 5, 3, 2 pdl> p $a->corr_table [ [ [ 1 -0.69835951 -0.18549048] [-0.69835951 1 0.72481605] [-0.18549048 0.72481605 1] ] [ [ 1 0.82722569 -0.71779883] [ 0.82722569 1 -0.63938828] [-0.71779883 -0.63938828 1] ] ] for the same result using B, pdl> p $a->dummy(2)->corr($a->dummy(1)) This is also how to use B and B with such a table. 
', ); pp_def('t_corr', Pars => 'r(); n(); [o]t()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD( if ($ISBAD(r()) || $ISBAD(n()) ) { $SETBAD( $t() ); continue; } if ($n() <= 2) { $SETBAD(t()); continue; } ,) $t() = $r() / pow( (1 - $r()*$r()) / ($n() - 2) , .5); ', Doc => ' =for ref t significance test for Pearson correlations. =for example $corr = $data->corr( $data->dummy(1) ); $n = $data->n_pair( $data->dummy(1) ); $t_corr = $corr->t_corr( $n ); use PDL::GSL::CDF; $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t_corr->abs, $n-2 )); ', ); pp_def('n_pair', Pars => 'a(n); b(n); indx [o]c()', GenericTypes => [qw/L Q/], HandleBad => 1, Code => ' PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); PDL_IF_BAD(loop(n) %{ if ($ISBAD($a()) || $ISBAD($b())) continue; N++; %},) $c() = N; ', Doc => 'Returns the number of good pairs between 2 lists. Useful with B (esp. when bad values are involved)', ); pp_def('corr_dev', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");', Code => ' $GENERIC(c) ab, a2, b2, cov, va, vb; ab = 0; a2 = 0; b2 = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop (n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) continue; N++;,) ab += $a() * $b(); a2 += $a() * $a(); b2 += $b() * $b(); %} if (N < 2) { $SETBAD(c()); continue; } cov = ab / N; va = a2 / N; vb = b2 / N; $c() = cov / sqrt( va * vb ); ', Doc => 'Calculates correlations from B vals. 
Seems faster than doing B from original vals when data pdl is big', ); pp_def('t_test', Pars => 'a(n); b(m); [o]t(); [o]d()', GenericTypes => $F, HandleBad => 1, RedoDimsCode => ' if ($SIZE(n) < 2) $CROAK("too few N"); if ($SIZE(m) < 2) $CROAK("too few M"); ', Code => ' $GENERIC(t) N = PDL_IF_BAD(0,$SIZE(n)), M = PDL_IF_BAD(0,$SIZE(m)), sa = 0, sb = 0, a2 = 0, b2 = 0; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); a2 += $a() * $a(); %} loop (m) %{ PDL_IF_BAD(if ($ISBAD($b())) continue;,) sb += $b(); b2 += $b() * $b(); PDL_IF_BAD(M++;,) %} if (N < 2 || M < 2) { $SETBAD($t()); $SETBAD($d()); continue; } $d() = N + M - 2; $GENERIC(t) va = (a2 - sa*sa/N) / (N-1); $GENERIC(t) vb = (b2 - sb*sb/M) / (M-1); $GENERIC(t) sdiff = sqrt( (1/N + 1/M) * ((N-1)*va + (M-1)*vb) / $d() ); $t() = (sa/N - sb/M) / sdiff; ', Doc => ' =for ref Independent sample t-test, assuming equal var. =for example my ($t, $df) = t_test( $pdl1, $pdl2 ); use PDL::GSL::CDF; my $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t->abs, $df )); ', ); pp_def('t_test_nev', Pars => 'a(n); b(m); [o]t(); [o]d()', GenericTypes => $F, HandleBad => 1, RedoDimsCode => ' if ($SIZE(n) < 2) $CROAK("too few N"); if ($SIZE(m) < 2) $CROAK("too few M"); ', Code => ' $GENERIC(t) N = PDL_IF_BAD(0,$SIZE(n)), M = PDL_IF_BAD(0,$SIZE(m)), sa = 0, sb = 0, a2 = 0, b2 = 0; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,) sa += $a(); a2 += $a() * $a(); %} loop (m) %{ PDL_IF_BAD(if ($ISBAD($b())) continue; M++;,) sb += $b(); b2 += $b() * $b(); %} if (N < 2 || M < 2) { $SETBAD($t()); $SETBAD($d()); continue; } $GENERIC(t) se_a_2 = (a2 - sa*sa/N) / (N*(N-1)); $GENERIC(t) se_b_2 = (b2 - sb*sb/M) / (M*(M-1)); $GENERIC(t) sdiff = sqrt( se_a_2 + se_b_2 ); $t() = (sa/N - sb/M) / sdiff; $d() = (se_a_2 + se_b_2)*(se_a_2 + se_b_2) / ( se_a_2*se_a_2 / (N-1) + se_b_2*se_b_2 / (M-1) ) ; ', Doc => 'Independent sample t-test, NOT assuming equal var. ie Welch two sample t test. 
Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R.', ); pp_def('t_test_paired', Pars => 'a(n); b(n); [o]t(); [o]d()', GenericTypes => $F, HandleBad => 1, RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");', Code => ' $GENERIC(t) N = PDL_IF_BAD(0,$SIZE(n)), s_dif = 0, diff2 = 0; loop (n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) continue; N++;,) $GENERIC(t) diff = $a() - $b(); s_dif += diff; diff2 += diff*diff; %} if (N < 2) { $SETBAD($t()); $SETBAD($d()); continue; } $d() = N - 1; $t() = s_dif / sqrt( ( N*diff2 - s_dif*s_dif ) / (N-1) ); ', Doc => 'Paired sample t-test.', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 binomial_test =for Sig Signature: (x(); n(); p_expected(); [o]p()) =for ref Binomial test. One-tailed significance test for two-outcome distribution. Given the number of successes, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes. This function does NOT currently support bad value in the number of successes. =for example Usage: # assume a fair coin, ie. 0.5 probablity of getting heads # test whether getting 8 heads out of 10 coin flips is unusual my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual. =cut *binomial_test = \&PDL::binomial_test; sub PDL::binomial_test { my ($x, $n, $P) = @_; carp 'Please install PDL::GSL::CDF.' unless $CDF; carp 'This function does NOT currently support bad value in the number of successes.' if $x->badflag(); my $pdlx = pdl($x); $pdlx->badflag(1); $pdlx = $pdlx->setvaltobad(0); my $p = 1 - PDL::GSL::CDF::gsl_cdf_binomial_P( $pdlx - 1, $P, $n ); $p = $p->setbadtoval(1); $p->badflag(0); return $p; } =head1 METHODS =head2 rtable =for ref Reads either file or file handle*. Returns observation x variable pdl and var and obs ids if specified. Ids in perl @ ref to allow for non-numeric ids. 
Other non-numeric entries are treated as missing, which are filled with $opt{MISSN} then set to BAD*. Can specify num of data rows to read from top but not arbitrary range. *If passed handle, it will not be closed here. =for options Default options (case insensitive): V => 1, # verbose. prints simple status TYPE => double, C_ID => 1, # boolean. file has col id. R_ID => 1, # boolean. file has row id. R_VAR => 0, # boolean. set to 1 if var in rows SEP => "\t", # can take regex qr// MISSN => -999, # this value treated as missing and set to BAD NROW => '', # set to read specified num of data rows =for usage Usage: Sample file diet.txt: uid height weight diet akw 72 320 1 bcm 68 268 1 clq 67 180 2 dwm 70 200 2 ($data, $idv, $ido) = rtable 'diet.txt'; # By default prints out data info and @$idv index and element reading diet.txt for data and id... OK. data table as PDL dim o x v: PDL: Double D [4,3] 0 height 1 weight 2 diet Another way of using it, $data = rtable( \*STDIN, {TYPE=>long} ); =cut sub rtable { # returns obs x var data matrix and var and obs ids my ($src, $opt) = @_; my $fh_in; if ($src =~ /STDIN/ or ref $src eq 'GLOB') { $fh_in = $src } else { open $fh_in, $src or croak "$!" } my %opt = ( V => 1, TYPE => double, C_ID => 1, R_ID => 1, R_VAR => 0, SEP => "\t", MISSN => -999, NROW => '', ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } $opt{V} and print "reading $src for data and id... 
"; local $PDL::undefval = $opt{MISSN}; my $id_c = []; # match declaration of $id_r for return purpose if ($opt{C_ID}) { chomp( $id_c = <$fh_in> ); my @entries = split $opt{SEP}, $id_c; $opt{R_ID} and shift @entries; $id_c = \@entries; } my ($c_row, $id_r, $data, @data) = (0, [], PDL->null, ); while (<$fh_in>) { chomp; my @entries = split /$opt{SEP}/, $_, -1; $opt{R_ID} and push @$id_r, shift @entries; # rudimentary check for numeric entry for (@entries) { $_ = $opt{MISSN} unless defined $_ and m/\d\b/ } push @data, pdl( $opt{TYPE}, \@entries ); $c_row ++; last if $opt{NROW} and $c_row == $opt{NROW}; } # not explicitly closing $fh_in here in case it's passed from outside # $fh_in will close by going out of scope if opened here. $data = pdl $opt{TYPE}, @data; @data = (); # rid of last col unless there is data there $data = $data->slice([0, $data->getdim(0)-2])->sever unless ( nelem $data->slice(-1)->where($data->slice(-1) != $opt{MISSN}) ); my ($idv, $ido) = ($id_r, $id_c); # var in columns instead of rows $opt{R_VAR} == 0 and ($data, $idv, $ido) = ($data->inplace->transpose, $id_c, $id_r); if ($opt{V}) { print "OK.\ndata table as PDL dim o x v: " . $data->info . "\n"; $idv and print "$_\t$$idv[$_]\n" for 0..$#$idv; } $data = $data->setvaltobad( $opt{MISSN} ); $data->check_badflag; return wantarray? (@$idv? ($data, $idv, $ido) : ($data, $ido)) : $data; } =head2 group_by Returns pdl reshaped according to the specified factor variable. Most useful when used in conjunction with other broadcasting calculations such as average, stdv, etc. When the factor variable contains unequal number of cases in each level, the returned pdl is padded with bad values to fit the level with the most number of cases. This allows the subsequent calculation (average, stdv, etc) to return the correct results for each level. 
Usage: # simple case with 1d pdl and equal number of n in each level of the factor pdl> p $a = sequence 10 [0 1 2 3 4 5 6 7 8 9] pdl> p $factor = $a > 4 [0 0 0 0 0 1 1 1 1 1] pdl> p $a->group_by( $factor )->average [2 7] # more complex case with broadcasting and unequal number of n across levels in the factor pdl> p $a = sequence 10,2 [ [ 0 1 2 3 4 5 6 7 8 9] [10 11 12 13 14 15 16 17 18 19] ] pdl> p $factor = qsort $a( ,0) % 3 [ [0 0 0 0 1 1 1 2 2 2] ] pdl> p $a->group_by( $factor ) [ [ [ 0 1 2 3] [10 11 12 13] ] [ [ 4 5 6 BAD] [ 14 15 16 BAD] ] [ [ 7 8 9 BAD] [ 17 18 19 BAD] ] ] ARRAY(0xa2a4e40) # group_by supports perl factors, multiple factors # returns factor labels in addition to pdl in array context pdl> p $a = sequence 12 [0 1 2 3 4 5 6 7 8 9 10 11] pdl> $odd_even = [qw( e o e o e o e o e o e o )] pdl> $magnitude = [qw( l l l l l l h h h h h h )] pdl> ($a_grouped, $label) = $a->group_by( $odd_even, $magnitude ) pdl> p $a_grouped [ [ [0 2 4] [1 3 5] ] [ [ 6 8 10] [ 7 9 11] ] ] pdl> p Dumper $label $VAR1 = [ [ 'e_l', 'o_l' ], [ 'e_h', 'o_h' ] ]; =cut *group_by = \&PDL::group_by; sub PDL::group_by { my $p = shift; my @factors = @_; if ( @factors == 1 ) { my $factor = $factors[0]; my $label; if (ref $factor eq 'ARRAY') { $label = _ordered_uniq($factor); $factor = code_ivs($factor); } else { my $perl_factor = [$factor->list]; $label = _ordered_uniq($perl_factor); } my $p_reshaped = _group_by_single_factor( $p, $factor ); return wantarray? ($p_reshaped, $label) : $p_reshaped; } # make sure all are arrays instead of pdls @factors = map { ref($_) eq 'PDL'? [$_->list] : $_ } @factors; my (@cells); for my $ele (0 .. $#{$factors[0]}) { my $c = join '_', map { $_->[$ele] } @factors; push @cells, $c; } # get uniq cell labels (ref List::MoreUtils::uniq) my %seen; my @uniq_cells = grep {! 
$seen{$_}++ } @cells; my $flat_factor = code_ivs( \@cells ); my $p_reshaped = _group_by_single_factor( $p, $flat_factor ); # get levels of each factor and reshape accordingly my @levels; for (@factors) { my %uniq; @uniq{ @$_ } = (); push @levels, scalar keys %uniq; } $p_reshaped = $p_reshaped->reshape( $p_reshaped->dim(0), @levels )->sever; # make labels for the returned data structure matching pdl structure my @labels; if (wantarray) { for my $ifactor (0 .. $#levels) { my @factor_label; for my $ilevel (0 .. $levels[$ifactor]-1) { my $i = $ifactor * $levels[$ifactor] + $ilevel; push @factor_label, $uniq_cells[$i]; } push @labels, \@factor_label; } } return wantarray? ($p_reshaped, \@labels) : $p_reshaped; } # get uniq cell labels (ref List::MoreUtils::uniq) sub _ordered_uniq { my $arr = shift; my %seen; my @uniq = grep { ! $seen{$_}++ } @$arr; return \@uniq; } sub _group_by_single_factor { my $p = shift; my $factor = shift; $factor = $factor->squeeze; die "Currently support only 1d factor pdl." if $factor->ndims > 1; die "Data pdl and factor pdl do not match!" unless $factor->dim(0) == $p->dim(0); # get active dim that will be split according to factor and dims to broadcast over my @p_broadcastdims = $p->dims; my $p_dim0 = shift @p_broadcastdims; my $uniq = $factor->uniq; my @uniq_ns; for ($uniq->list) { push @uniq_ns, which( $factor == $_ )->nelem; } # get number of n's in each group, find the biggest, fit output pdl to this my $uniq_ns = pdl \@uniq_ns; my $max = pdl(\@uniq_ns)->max->sclr; my $badvalue = int($p->max + 1); my $p_tmp = ones($max, @p_broadcastdims, $uniq->nelem) * $badvalue; for (0 .. $#uniq_ns) { my $i = which $factor == $uniq->slice($_); $p_tmp->dice_axis(-1,$_)->squeeze->slice([0,$uniq_ns[$_]-1]) .= $p->slice($i); } $p_tmp->badflag(1); return $p_tmp->setvaltobad($badvalue); } =head2 which_id =for ref Lookup specified var (obs) ids in $idv ($ido) (see B) and return indices in $idv ($ido) as pdl if found. 
The indices are ordered by the specified subset. Useful for selecting data by var (obs) id. =for usage my $ind = which_id $ido, ['smith', 'summers', 'tesla']; my $data_subset = $data( $ind, ); # take advantage of perl pattern matching # e.g. use data from people whose last name starts with s my $i = which_id $ido, [ grep { /^s/ } @$ido ]; my $data_s = $data($i, ); =cut sub which_id { my ($id, $id_s) = @_; my %ind; @ind{ @$id } = (0 .. $#$id); pdl grep defined, map $ind{$_}, @$id_s; } my %code_bad = map +($_=>1), '', 'BAD'; sub code_ivs { my ($var_ref) = @_; $var_ref = [ $var_ref->list ] if UNIVERSAL::isa($var_ref, 'PDL'); my @filtered = map !defined($_) || $code_bad{$_} ? undef : $_, @$var_ref; my ($l, %level) = 0; $level{$_} //= $l++ for grep defined, @filtered; my $pdl = pdl(map defined($_) ? $level{$_} : -1, @filtered)->setvaltobad(-1); $pdl->check_badflag; wantarray ? ($pdl, \%level) : $pdl; } =head1 SEE ALSO PDL::Basic (hist for frequency counts) PDL::Ufunc (sum, avg, median, min, max, etc.) PDL::GSL::CDF (various cumulative distribution functions) =head1 REFERENCES Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, TX: Harcourt Brace College Publishers. =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.855/lib/PDL/Stats/TS.pd0000644000175000017500000003564214757264011016353 0ustar osboxesosboxesuse strict; use warnings; my $F = [qw(F D)]; pp_addpm({At=>'Top'}, <<'EOD'); =encoding utf8 =head1 NAME PDL::Stats::TS -- basic time series functions =head1 DESCRIPTION The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. Plots require L. ***EXPERIMENTAL!*** In particular, bad value support is spotty and may be shaky. USE WITH DISCRETION! 
=head1 SYNOPSIS use PDL::LiteF; use PDL::Stats::TS; my $r = $data->acf(5); =cut use strict; use warnings; use Carp; use PDL::LiteF; use PDL::Stats::Basic; use PDL::Stats::Kmeans; EOD pp_addhdr(' #include #define Z10 1.64485362695147 #define Z05 1.95996398454005 #define Z01 2.5758293035489 #define Z001 3.29052673149193 ' ); pp_def('acf', Pars => 'x(t); [o]r(h)', OtherPars => 'IV lag=>h', GenericTypes => $F, Code => ' $GENERIC(x) s, s2, m, cov0, covh; s=0; s2=0; m=0; cov0=0; covh=0; PDL_Indx T, i; T = $SIZE(t); loop(t) %{ s += $x(); s2 += $x()*$x(); %} m = s/T; cov0 = s2 - T * m * m; loop (h) %{ if (h) { covh = 0; for (i=0; ii) - m) * ($x(t=>i+h) - m); } $r() = covh / cov0; } else { $r() = 1; } %} ', PMCode => pp_line_numbers(__LINE__, <<'EOF'), sub PDL::acf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; PDL::_acf_int($self, my $r = PDL->null, $h+1); $r; } EOF Doc => <<'EOD', =for ref Autocorrelation function for up to lag h. If h is not specified it's set to t-1 by default. acf does not process bad values. =for example usage: pdl> $a = sequence 10 # lags 0 .. 5 pdl> p $a->acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] EOD ); pp_def('acvf', Pars => 'x(t); [o]v(h)', OtherPars => 'IV lag=>h;', GenericTypes => $F, Code => ' $GENERIC(x) s, s2, m, covh; s=0; s2=0; m=0; covh=0; long T, i; T = $SIZE(t); loop(t) %{ s += $x(); s2 += $x()*$x(); %} m = s/T; loop (h) %{ if (h) { covh = 0; for (i=0; ii) - m) * ($x(t=>i+h) - m); } $v() = covh; } else { $v() = s2 - T * m * m; } %} ', PMCode => pp_line_numbers(__LINE__, <<'EOF'), sub PDL::acvf { my ($self, $h) = @_; $h ||= $self->dim(0) - 1; PDL::_acvf_int($self, my $v = PDL->null, $h+1); $v; } EOF Doc => <<'EOD', =for ref Autocovariance function for up to lag h. If h is not specified it's set to t-1 by default. acvf does not process bad values. =for example usage: pdl> $a = sequence 10 # lags 0 .. 
5 pdl> p $a->acvf(5) [82.5 57.75 34 12.25 -6.5 -21.25] # autocorrelation pdl> p $a->acvf(5) / $a->acvf(0) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] EOD ); pp_def('dseason', Pars => 'x(t); indx d(); [o]xd(t)', GenericTypes => $F, HandleBad => 1, Code => ' PDL_Indx i, max = PDL_IF_BAD(,$SIZE(t))-1, min = PDL_IF_BAD(-1,0); PDL_Indx q = ($d() % 2)? ($d() - 1) / 2 : $d() / 2; /*find good min and max ind*/ loop (t) %{ PDL_IF_BAD(if ($ISBAD($x())) continue;,) if (min < 0) min = t; max = t; %} if ($d() % 2) { loop(t) %{ PDL_IF_BAD(if (t < min || t > max) { $SETBAD(xd()); continue; },) $GENERIC(x) sum = 0; PDL_IF_BAD(PDL_Indx dd = 0;,) for (i=-q; i<=q; i++) { PDL_Indx ti = (t+i < min)? min : (t+i > max)? max : t+i ; PDL_IF_BAD(if ($ISBAD($x(t=>ti))) continue; dd++;,) sum += $x(t=>ti); } PDL_IF_BAD(if (!dd) { $SETBAD(xd()); continue; },) $xd() = sum / PDL_IF_BAD(dd,$d()); %} } else { loop(t) %{ PDL_IF_BAD(if (t < min || t > max) { $SETBAD(xd()); continue; },) $GENERIC(x) sum = 0; PDL_IF_BAD(PDL_Indx dd = 0;,) for (i=-q; i<=q; i++) { PDL_Indx ti = (t+i < min)? min : (t+i > max)? max : t+i ; PDL_IF_BAD(if ($ISBAD($x(t=>ti))) continue; dd++;,) sum += (i == q || i == -q)? .5 * $x(t=>ti) : $x(t=>ti); } PDL_IF_BAD(if (!dd) { $SETBAD(xd()); continue; } dd--; if ( ($ISBAD(x(t=>t-q)) && $ISGOOD(x(t=>t+q)) ) || ($ISBAD(x(t=>t+q)) && $ISGOOD(x(t=>t-q)) ) ) dd += .5; ,) $xd() = sum / PDL_IF_BAD(dd,$d()); %} } ', Doc => 'Deseasonalize data using moving average filter the size of period d.', ); pp_def('fill_ma', Pars => 'x(t); indx q(); [o]xf(t)', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(x) sum, xx; PDL_Indx i, n, max = $SIZE(t) - 1; loop(t) %{ PDL_IF_BAD(if ($ISBAD(x())) { n=0; sum=0; for (i=-$q(); i<=$q(); i++) { xx = (t+i < 0)? $x(t=>0) : (t+i > max)? 
$x(t=>max) : $x(t=>t+i) ; if ($ISGOODVAR(xx,x)) { sum += xx; n ++; } } if (n) { $xf() = sum / n; } else { $SETBAD(xf()); } continue; },) $xf() = $x(); %} ', PMCode => pp_line_numbers(__LINE__, <<'EOF'), sub PDL::fill_ma { my ($x, $q) = @_; PDL::_fill_ma_int($x, $q, my $x_filled = PDL->null); $x_filled->check_badflag; # carp "ma window too small, still has bad value" # if $x_filled->badflag; return $x_filled; } EOF Doc => <<'EOD', =for ref Fill missing value with moving average. xf(t) = sum(x(t-q .. t-1, t+1 .. t+q)) / 2q. =for bad fill_ma does handle bad values. Output pdl bad flag is cleared unless the specified window size q is too small and there are still bad values. EOD ); pp_def('filter_exp', Pars => 'x(t); a(); [o]xf(t)', GenericTypes => $F, Code => ' $GENERIC(x) b, m; b = 1 - $a(); loop(t) %{ if (t) { m = $a() * $x() + b * m; } else { m = $x(); } $xf() = m; %} ', Doc => 'Filter, exponential smoothing. xf(t) = a * x(t) + (1-a) * xf(t-1)', ); pp_def('filter_ma', Pars => 'x(t); indx q(); [o]xf(t)', GenericTypes => $F, Code => ' $GENERIC(x) sum; PDL_Indx i, n, max; n = 2 * $q() + 1; max = $SIZE(t) - 1; loop(t) %{ sum = 0; for (i=-$q(); i<=$q(); i++) { sum += (t+i < 0)? $x(t=>0) : (t+i > max)? $x(t=>max) : $x(t=>t+i) ; } $xf() = sum / n; %} ', Doc => 'Filter, moving average. xf(t) = sum(x(t-q .. t+q)) / (2q + 1)', ); pp_def('mae', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) sum; sum = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop(n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD(b())) continue; N++;,) sum += fabs( $a() - $b() ); %} if (N < 1) { $SETBAD(c()); continue; } $c() = sum / N; ', Doc => 'Mean absolute error. 
MAE = 1/n * sum( abs(y - y_pred) )', ); pp_def('mape', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) sum; sum = 0; PDL_Indx N = PDL_IF_BAD(0,$SIZE(n)); loop(n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD(b())) continue; N++;,) sum += fabs( ($a() - $b()) / $a() ); %} if (N < 1) { $SETBAD(c()); continue; } $c() = sum / N; ', Doc => 'Mean absolute percent error. MAPE = 1/n * sum(abs((y - y_pred) / y))', ); pp_def('wmape', Pars => 'a(n); b(n); [o]c()', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(c) sum_e=0, sum=0; loop(n) %{ PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD(b())) continue;,) sum_e += fabs( $a() - $b() ); sum += fabs( $a() ); %} if (!sum) { $SETBAD(c()); continue; } $c() = sum_e / sum; ', Doc => 'Weighted mean absolute percent error. avg(abs(error)) / avg(abs(data)). Much more robust compared to mape with division by zero error (cf. Schütz, W., & Kolassa, 2006).', ); pp_def('portmanteau', Pars => 'r(h); longlong t(); [o]Q()', GenericTypes => $F, Code => ' $GENERIC(r) sum; sum = 0; loop(h) %{ if (h) sum += $r()*$r() / ($t() - h); %} $Q() = $t() * ($t()+2) * sum; ', Doc => ' =for ref Portmanteau significance test (Ljung-Box) for autocorrelations. =for example Usage: pdl> $a = sequence 10 # acf for lags 0-5 # lag 0 excluded from portmanteau pdl> p $chisq = $a->acf(5)->portmanteau( $a->nelem ) 11.1753902662994 # get p-value from chisq distr pdl> use PDL::GSL::CDF pdl> p 1 - gsl_cdf_chisq_P( $chisq, 5 ) 0.0480112934306748 ', ); pp_def('pred_ar', Pars => 'x(p); b(p); [o]pred(t)', OtherPars => 'IV end=>t;', GenericTypes => $F, Code => ' PDL_Indx ord = $SIZE(p); $GENERIC(x) xt, xp[ord]; loop (t) %{ if (t < ord) { xp[t] = $x(p=>t); $pred() = xp[t]; } else { xt = 0; loop(p) %{ xt += xp[p] * $b(p=>ord-p-1); xp[p] = (p < ord - 1)? 
xp[p+1] : xt; %} $pred() = xt; } %} ', PMCode => pp_line_numbers(__LINE__, <<'EOF'), sub PDL::pred_ar { my ($x, $b, $t, $opt) = @_; my %opt = ( CONST => 1 ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } $b = PDL->topdl($b); # allows passing simple number my $ext; if ($opt{CONST}) { my $t_ = $t - ( $x->dim(0) - $b->dim(0) + 1 ); PDL::_pred_ar_int($x->slice([-$b->dim(0)+1,-1]), $b->slice('0:-2'), $ext = PDL->null, $t_); $ext->slice([$b->dim(0)-1,-1]) += $b->slice(-1); return $x->append( $ext->slice([$b->dim(0)-1,-1]) ); } else { my $t_ = $t - ( $x->dim(0) - $b->dim(0) ); PDL::_pred_ar_int($x->slice([-$b->dim(0),-1]), $b, $ext = PDL->null, $t_); return $x->append($ext->slice([$b->dim(0),-1])); } } EOF Doc => <<'EOD', =for ref Calculates predicted values up to period t (extend current series up to period t) for autoregressive series, with or without constant. If there is constant, it is the last element in b, as would be returned by ols or ols_t. pred_ar does not process bad values. =for options CONST => 1, =for example Usage: pdl> $x = sequence 2 # last element is constant pdl> $b = pdl(.8, -.2, .3) pdl> p $x->pred_ar($b, 7) [0 1 1.1 0.74 0.492 0.3656 0.31408] # no constant pdl> p $x->pred_ar($b(0:1), 7, {const=>0}) [0 1 0.8 0.44 0.192 0.0656 0.01408] EOD ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 season_m Given length of season, returns seasonal mean and variance for each period (returns seasonal mean only in scalar context). 
=for options Default options (case insensitive): START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, # boolean # see PDL::Graphics::Simple for next options WIN => undef, # pass pgswin object for more plotting control COLOR => 1, =for usage my ($m, $ms) = $data->season_m( 24, { START_POSITION=>2 } ); =cut *season_m = \&PDL::season_m; sub PDL::season_m { my ($self, $d, $opt) = @_; my %opt = ( START_POSITION => 0, # series starts at this position in season MISSING => -999, # internal mark for missing points in season PLOT => 0, WIN => undef, # pass pgswin object for more plotting control COLOR => 1, ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $n_season = ($self->dim(0) + $opt{START_POSITION}) / $d; $n_season = pdl($n_season)->ceil->sum->sclr; my @dims = $self->dims; $dims[0] = $n_season * $d; my $data = zeroes( @dims ) + $opt{MISSING}; $data->slice([$opt{START_POSITION},$opt{START_POSITION} + $self->dim(0)-1]) .= $self; $data->badflag(1); $data->inplace->setvaltobad( $opt{MISSING} ); my $s = sequence $d; $s = $s->dummy(1, $n_season)->flat; $s = $s->iv_cluster(); my ($m, $ms) = $data->centroid( $s ); if ($opt{PLOT}) { require PDL::Graphics::Simple; my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(); my $seq = sequence($d); my $errb_length = sqrt( $ms / $s->sumover )->squeeze; my $col = $opt{COLOR}; my @plots = map +(with=>'lines', ke=>"Data $col", style=>$col++, $seq, $_), $m->dog; push @plots, with=>'errorbars', ke=>'Error', style=>$opt{COLOR}, $seq, $m->squeeze, $errb_length if $m->squeeze->ndims < 2 && ($errb_length > 0)->any; $w->plot(@plots, { xlabel=>'period', ylabel=>'mean' }); } return wantarray? ($m, $ms) : $m; } =head2 plot_dseason =for ref Plots deseasonalized data and original data points. Opens and closes default window for plotting unless a C object is passed in options. Returns deseasonalized data. 
=for options Default options (case insensitive): WIN => undef, COLOR => 1, # data point color =cut *plot_dseason = \&PDL::plot_dseason; sub PDL::plot_dseason { require PDL::Graphics::Simple; my ($self, $d, $opt) = @_; !defined($d) and croak "please set season period length"; $self = $self->squeeze; my %opt = ( WIN => undef, COLOR => 1, # data point color ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $dsea = $self->dseason($d); my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(); my $seq = sequence($self->dim(0)); my $col = $opt{COLOR}; my @plots = map +(with=>'lines', ke=>"Data $col", style=>$col++, $seq, $_), $dsea->dog; $col = $opt{COLOR}; push @plots, map +(with=>'points', ke=>"De-seasonalised $col", style=>$col++, $seq, $_), $self->dog; $w->plot(@plots, { xlabel=>'T', ylabel=>'DV' }); return $dsea; } =head1 METHODS =head2 plot_acf =for ref Plots and returns autocorrelations for a time series. =for options Default options (case insensitive): SIG => 0.05, # can specify .10, .05, .01, or .001 WIN => undef, =for usage Usage: pdl> $a = sequence 10 pdl> p $r = $a->plot_acf(5) [1 0.7 0.41212121 0.14848485 -0.078787879 -0.25757576] =cut *plot_acf = \&PDL::plot_acf; sub PDL::plot_acf { require PDL::Graphics::Simple; my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef; my ($self, $h) = @_; my $r = $self->acf($h); my %opt = ( SIG => 0.05, WIN => undef, ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $y_sig = ($opt{SIG} == 0.10)? 1.64485362695147 : ($opt{SIG} == 0.05)? 1.95996398454005 : ($opt{SIG} == 0.01)? 2.5758293035489 : ($opt{SIG} == 0.001)? 3.29052673149193 : 0 ; unless ($y_sig) { carp "SIG outside of recognized value. 
default to 0.05"; $y_sig = 1.95996398454005; } my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(); my $seq = pdl(-1,$h+1); my $y_seq = ones(2) * $y_sig / sqrt($self->dim(0)) * -1; $w->plot( with=>'lines', $seq, zeroes(2), # x axis with=>'lines', style=>2, $seq, $y_seq, with=>'lines', style=>2, $seq, -$y_seq, (map +(with=>'lines', ones(2)*$_, pdl(0, $r->slice("($_)"))), 0..$h), { xlabel=>'lag', ylabel=>'acf', } ); $r; } =head1 REFERENCES Brockwell, P.J., & Davis, R.A. (2002). Introduction to Time Series and Forecasting (2nd ed.). New York, NY: Springer. Schütz, W., & Kolassa, S. (2006). Foresight: advantages of the MAD/Mean ratio over the MAPE. Retrieved Jan 28, 2010, from http://www.saf-ag.com/226+M5965d28cd19.html =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.855/lib/PDL/Stats/Kmeans.pd0000644000175000017500000005303314757253725017247 0ustar osboxesosboxesuse strict; use warnings; my $F = [qw(F D)]; pp_add_exported('', 'random_cluster', 'iv_cluster'); pp_addpm({At=>'Top'}, <<'EOD'); use strict; use warnings; use Carp; use PDL::LiteF; use PDL::Stats::Basic; =head1 NAME PDL::Stats::Kmeans -- classic k-means cluster analysis =head1 DESCRIPTION Assumes that we have data pdl dim [observation, variable] and the goal is to put observations into clusters based on their values on the variables. The terms "observation" and "variable" are quite arbitrary but serve as a reminder for "that which is being clustered" and "that which is used to cluster". The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are non-broadcastable, respectively. 
=head1 SYNOPSIS Implement a basic k-means procedure, use PDL::LiteF; use PDL::Stats; my ($data, $idv, $ido) = rtable( $file ); # or generate random data: $data = grandom(200, 2); # two vars as below my ($cluster, $centroid, $ss_centroid, $cluster_last); # start out with 8 random clusters $cluster = random_cluster( $data->dim(0), 8 ); # iterate to minimize total ss # stop when no more changes in cluster membership do { $cluster_last = $cluster; ($centroid, $ss_centroid) = $data->centroid( $cluster ); $cluster = $data->assign( $centroid ); } while sum(abs($cluster - $cluster_last)) > 0; or, use the B function provided here, my %k = $data->kmeans( \%opt ); print "$_\t$k{$_}\n" for sort keys %k; plot the clusters if there are only 2 vars in $data, use PDL::Graphics::Simple; my ($win, $c); $win = pgswin(); $win->plot(map +(with=>'points', $data->dice_axis(0,which($k{cluster}->(,$_)))->dog), 0 .. $k{cluster}->dim(1)-1); =cut EOD pp_addhdr(' #include #include #include ' ); pp_addpm pp_line_numbers(__LINE__, <<'EOD' ); =head2 random_cluster =for ref Creates masks for random mutually exclusive clusters. Accepts two parameters, num_obs and num_cluster. Extra parameter turns into extra dim in mask. May loop a long time if num_cluster approaches num_obs because empty cluster is not allowed. =for usage my $cluster = random_cluster( $num_obs, $num_cluster ); =cut # can't be called on pdl sub random_cluster { my ($obs, $clu, @extra) = @_; # extra param in @_ made into extra dim my $cluster = zeroes short(), @_; do { (random($obs, @extra) * $obs)->_random_cluster($cluster); } while (PDL::any $cluster->sumover == 0 ); $cluster; } EOD pp_def('_random_cluster', Pars => 'rand_data(o); [o]b(o,c)', GenericTypes => ['S'], RedoDimsCode => 'if ($SIZE(c) > $SIZE(o)) $CROAK("more cluster than obs!");', Code => ' PDL_Indx nc = $SIZE(c); broadcastloop %{ loop (o) %{ PDL_Indx cl = $rand_data() % nc; loop (c) %{ $b() = (c == cl) ? 
1 : 0; %} %} %} ', Doc => undef, ); pp_def('which_cluster', Pars => 'short a(o,c); indx [o]b(o)', GenericTypes => [qw(U L)], HandleBad => 1, Code => ' loop(o) %{ PDL_Indx cl=-1; loop(c) %{ PDL_IF_BAD(if ($ISBAD($a())) continue;,) if (!$a()) continue; cl = c; break; %} PDL_IF_BAD(if (cl==-1) { $SETBAD(b()); continue; },) $b() = cl; %} ', Doc => 'Given cluster mask dim [obs x clu], returns the cluster index to which an obs belong. Does not support overlapping clusters. If an obs has TRUE value for multiple clusters, the returned index is the first cluster the obs belongs to. If an obs has no TRUE value for any cluster, the return val is set to -1 or BAD if the input mask has badflag set. Usage: # create a cluster mask dim [obs x clu] pdl> p $c_mask = iv_cluster [qw(a a b b c c)] [ [1 1 0 0 0 0] [0 0 1 1 0 0] [0 0 0 0 1 1] ] # get cluster membership list dim [obs] pdl> p $ic = $c_mask->which_cluster [0 0 1 1 2 2] ', ); pp_def('assign', Pars => 'data(o,v); centroid(c,v); short [o]cluster(o,c)', GenericTypes => $F, HandleBad => 1, Code => ' PDL_Indx cl = 0; loop (o) %{ $GENERIC(centroid) ssmin = -1; loop (c) %{ $GENERIC(centroid) ssc = 0; PDL_Indx nvc = 0; loop (v) %{ PDL_IF_BAD(if ($ISBAD($data()) || $ISBAD($centroid())) continue;,) $GENERIC() diff = $data() - $centroid(); ssc += diff*diff; nvc ++; %} if (nvc) { ssc /= nvc; } else { /* taking advantage of the fact that 1st valid ssmin takes precedence */ /* so ssc has no effect if there is already ssmin. or it is -1 */ ssc = ssmin; } /* notice that if multiple ssc == ssmin the 1st is taken as cluster */ if (ssmin < 0 || ssmin > ssc) { cl = c; ssmin = ssc; } %} loop (c) %{ PDL_IF_BAD(if (ssmin < 0) { $SETBAD($cluster()); continue; },) $cluster() = (c == cl)? 1 : 0; %} %} ', Doc => ' =for ref Takes data pdl dim [obs x var] and centroid pdl dim [cluster x var] and returns mask dim [obs x cluster] to cluster membership. 
An obs is assigned to the first cluster with the smallest distance (ie sum squared error) to cluster centroid. With bad value, obs is assigned by smallest mean squared error across variables. =for usage pdl> p $centroid = xvals 2, 3 [ [0 1] [0 1] [0 1] ] pdl> p $b = qsort( random 4, 3 ) [ [0.022774068 0.032513883 0.13890034 0.30942479] [ 0.16943853 0.50262636 0.56251531 0.7152271] [ 0.23964483 0.59932745 0.60967495 0.78452117] ] # notice that 1st 3 obs in $b are on average closer to 0 # and last obs closer to 1 pdl> p $b->assign( $centroid ) [ [1 1 1 0] # cluster 0 membership [0 0 0 1] # cluster 1 membership ] ', ); pp_def('centroid', Pars => 'data(o,v); cluster(o,c); [o]m(c,v); [o]ss(c,v)', GenericTypes => $F, HandleBad => 1, Code => ' $GENERIC(m) s[ $SIZE(c) ][ $SIZE(v) ], s2[ $SIZE(c) ][ $SIZE(v) ]; PDL_Indx n[ $SIZE(c) ]PDL_IF_BAD([ $SIZE(v) ],); loop (c) %{ loop (v) %{ s[c][v] = 0.0; s2[c][v] = 0.0; PDL_IF_BAD(n[c][v] = 0;,) %} PDL_IF_BAD(,n[c] = 0;) loop (o) %{ PDL_IF_BAD(if ($ISBAD($cluster())) continue;,) if (!$cluster()) continue; PDL_IF_BAD(,n[c]++;) loop (v) %{ PDL_IF_BAD(if ($ISBAD($data())) continue;,) s[c][v] += $data(); s2[c][v] += $data()*$data(); PDL_IF_BAD(n[c][v]++;,) %} %} loop (v) %{ if (n[c]PDL_IF_BAD([v],)) { $m() = s[c][v] / n[c]PDL_IF_BAD([v],); $ss() = s2[c][v] PDL_IF_BAD(/ n[c][v],) - pow(s[c][v] / n[c]PDL_IF_BAD([v],), 2) PDL_IF_BAD(,* n[c]); } else { $m() = 0; $ss() = 0; /* $CROAK("please make sure there is no empty cluster!"); */ } %} %} ', Doc => ' =for ref Takes data dim [obs x var] and mask dim [obs x cluster], returns mean and ss (ms when data contains bad values) dim [cluster x var], using data where mask == 1. Multiple cluster membership for an obs is okay. If a cluster is empty all means and ss are set to zero for that cluster. 
=for usage # data is 10 obs x 3 var pdl> p $d = sequence 10, 3 [ [ 0 1 2 3 4 5 6 7 8 9] [10 11 12 13 14 15 16 17 18 19] [20 21 22 23 24 25 26 27 28 29] ] # create two clusters by value on 1st var pdl> p $a = $d( ,(0)) <= 5 [1 1 1 1 1 1 0 0 0 0] pdl> p $b = $d( ,(0)) > 5 [0 0 0 0 0 0 1 1 1 1] pdl> p $c = cat $a, $b [ [1 1 1 1 1 1 0 0 0 0] [0 0 0 0 0 0 1 1 1 1] ] pdl> p $d->centroid($c) # mean for 2 cluster x 3 var [ [ 2.5 7.5] [12.5 17.5] [22.5 27.5] ] # ss for 2 cluster x 3 var [ [17.5 5] [17.5 5] [17.5 5] ] ', ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); sub _scree_ind { # use as scree cutoff the point with max distance to the line formed # by the 1st and last points in $self # it's a heuristic--whether we can get "good" results depends on # the number of components in $self. my ($self) = @_; $self = $self->squeeze; $self->ndims > 1 and croak "1D pdl only please"; my $a = zeroes 2, $self->nelem; $a->slice('(0)') .= sequence $self->nelem; $a->slice('(1)') .= $self; my $d = _d_point2line( $a, $a->slice(':,(0)'), $a->slice(':,(-1)') ); return $d->maximum_ind; } sub _d_point2line { my ($self, $p1, $p2) = @_; for ($self, $p1, $p2) { $_->dim(0) != 2 and carp "point pdl dim(0) != 2"; } return _d_p2l( $self->mv(0,-1)->dog, $p1->mv(0,-1)->dog, $p2->mv(0,-1)->dog ); } EOD pp_def('_d_p2l', Pars => 'xc(); yc(); xa(); ya(); xb(); yb(); [o]d()', GenericTypes => $F, HandleBad => 1, Code => ' PDL_IF_BAD(if ($ISBAD(xc()) || $ISBAD(yc()) || $ISBAD(xa()) || $ISBAD(ya()) || $ISBAD(xb()) || $ISBAD(yb()) ) { $SETBAD(d()); continue; },) $GENERIC(d) xba, yba; xba = $xb() - $xa(); yba = $yb() - $ya(); $d() = fabs( xba * ($ya() - $yc()) - ($xa() - $xc()) * yba ) / hypot( xba, yba ); ', Doc => undef, ); pp_addpm pp_line_numbers(__LINE__, <<'EOD'); =head2 kmeans =for ref Implements classic k-means cluster analysis. 
=for example $data = grandom(200, 2); # two rows = two dimensions %k = $data->kmeans; # use default of 3 clusters print "$_\t$k{$_}\n" for sort keys %k; $w->plot( (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1), $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog), 0 .. $k{cluster}->dim(1)-1), (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1), 0 .. $k{centroid}->dim(0)-1), {le=>'tr'}, ); Given a number of observations with values on a set of variables, kmeans puts the observations into clusters that maximizes within-cluster similarity with respect to the variables. Tries several different random seeding and clustering in parallel. Stops when cluster assignment of the observations no longer changes. Returns the best result in terms of R2 from the random-seeding trials. Instead of random seeding, kmeans also accepts manual seeding. This is done by providing a centroid to the function, in which case clustering will proceed from the centroid and there is no multiple tries. There are two distinct advantages from seeding with a centroid compared to seeding with predefined cluster membership of a subset of the observations ie "seeds": =over =item * a centroid could come from a previous study with a different set of observations; =item * a centroid could even be "fictional", or in more proper parlance, an idealized prototype with respect to the actual data. For example, if there are 10 person's ratings of 1 to 5 on 4 movies, ie a ratings pdl of dim [10 obs x 4 var], providing a centroid like [ [5 0 0 0] [0 5 0 0] [0 0 5 0] [0 0 0 5] ] will produce 4 clusters of people with each cluster favoring a different one of the 4 movies. Clusters from an idealized centroid may not give the best result in terms of R2, but they sure are a lot more interpretable. 
=back If clustering has to be done from predefined clusters of seeds, simply calculate the centroid using the B function and feed it to kmeans, my ($centroid, $ss) = $rating($iseeds, )->centroid( $seeds_cluster ); my %k = $rating->kmeans( { CNTRD=>$centroid } ); kmeans supports bad value*. =for options Default options (case insensitive): V => 1, # prints simple status FULL => 0, # returns results for all seeding trials CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts NTRY => 5, # num of random seeding trials NSEED => 1000, # num of initial seeds, use NSEED up to max obs NCLUS => 3, # num of clusters =for usage Usage: # suppose we have 4 person's ratings on 5 movies pdl> p $rating = ceil( random(4, 5) * 5 ) [ [3 2 2 3] [2 4 5 4] [5 3 2 3] [3 3 1 5] [4 3 3 2] ] # we want to put the 4 persons into 2 groups pdl> %k = $rating->kmeans( {NCLUS=>2} ) # by default prints back options used # as well as info for all tries and iterations CNTRD => Null FULL => 0 NCLUS => 3 NSEED => 4 NTRY => 5 V => 1 ss total: 20.5 iter 0 R2 [0.024390244 0.024390244 0.26829268 0.4796748 0.4796748] iter 1 R2 [0.46341463 0.46341463 0.4796748 0.4796748 0.4796748] pdl> p "$_\t$k{$_}\n" for sort keys %k R2 0.479674796747968 centroid # mean ratings for 2 group x 5 movies [ [ 3 2.3333333] [ 2 4.3333333] [ 5 2.6666667] [ 3 3] [ 4 2.6666667] ] cluster # 4 persons' membership in two groups [ [1 0 0 0] [0 1 1 1] ] n [1 3] # cluster size ss [ [ 0 0.66666667] [ 0 0.66666667] [ 0 0.66666667] [ 0 8] [ 0 0.66666667] ] Now, for the valiant, kmeans is broadcastable. Say you gathered 10 persons' ratings on 5 movies from 2 countries, so the data is dim [10,5,2], and you want to put the 10 persons from each country into 3 clusters, just specify NCLUS => [3,1], and there you have it. The key is for NCLUS to include $data->ndims - 1 numbers. The 1 in [3,1] turns into a dummy dim, so the 3-cluster operation is repeated on both countries. 
Similarly, when seeding, CNTRD needs to have ndims that at least match the data ndims. Extra dims in CNTRD will lead to broadcasting (convenient if you want to try out different centroid locations, for example, but you will have to hand pick the best result). See F for examples w 3D and 4D data. *With bad value, R2 is based on average of variances instead of sum squared error. =cut *kmeans = \&PDL::kmeans; sub PDL::kmeans { my ($self, $opt) = @_; my %opt = ( V => 1, # prints simple status FULL => 0, # returns results for all seeding trials CNTRD => PDL->null, # optional. pdl [clu x var]. disables next 3 opts NTRY => 5, # num of random seeding trials NSEED => 1000, # num of initial seeds, use NSEED up to max obs NCLUS => 3, # num of clusters ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } if (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) { $opt{NTRY} = 1; $opt{NSEED} = $self->dim(0); $opt{NCLUS} = $opt{CNTRD}->dim(0); } else { $opt{NSEED} = pdl($self->dim(0), $opt{NSEED})->min->sclr; } $opt{V} and print "$_\t=> $opt{$_}\n" for sort keys %opt; my $ss_ms = $self->badflag? 'ms' : 'ss'; my $ss_total = $self->badflag? $self->var->average : $self->ss->sumover; $opt{V} and print "overall $ss_ms:\t$ss_total\n"; my ($centroid, $ss_cv, $R2, $clus_this, $clus_last); # NTRY made into extra dim in $cluster for broadcasting my @nclus = (ref $opt{NCLUS} eq 'ARRAY')? @{$opt{NCLUS}} : ($opt{NCLUS}); $clus_this = (defined($opt{CNTRD}) and $opt{CNTRD}->nelem) ? $self->assign( $opt{CNTRD}->dummy(-1) ) # put dummy(-1) to match NTRY : random_cluster($opt{NSEED}, @nclus, $opt{NTRY} ) ; ($centroid, $ss_cv) = $self->slice([0,$opt{NSEED} - 1])->centroid( $clus_this ); # now obs in $clus_this matches $self $clus_this = $self->assign( $centroid ); ($centroid, $ss_cv) = $self->centroid( $clus_this ); my $iter = 0; do { $R2 = $self->badflag? 1 - $ss_cv->average->average / $ss_total : 1 - $ss_cv->sumover->sumover / $ss_total ; $opt{V} and print join(' ',('iter', $iter++, 'R2', $R2)) . 
"\n"; $clus_last = $clus_this; $clus_this = $self->assign( $centroid ); ($centroid, $ss_cv) = $self->centroid( $clus_this ); } while ( any long(abs($clus_this - $clus_last))->sumover->sumover > 0 ); $opt{FULL} and return ( centroid => PDL::squeeze( $centroid ), cluster => PDL::squeeze( $clus_this ), n => PDL::squeeze( $clus_this )->sumover, R2 => PDL::squeeze( $R2 ), $ss_ms => PDL::squeeze( $ss_cv ), ); # xchg/mv(-1,0) leaves it as was if single dim--unlike transpose my $i_best = $R2->mv(-1,0)->maximum_ind; $R2->getndims == 1 and return ( centroid => $centroid->dice_axis(-1,$i_best)->sever->squeeze, cluster => $clus_this->dice_axis(-1,$i_best)->sever->squeeze, n => $clus_this->dice_axis(-1,$i_best)->sever->squeeze->sumover, R2 => $R2->dice_axis(-1,$i_best)->sever->squeeze, $ss_ms => $ss_cv->dice_axis(-1,$i_best)->sever->squeeze, ); # now for broadcasting beyond 2D data # can't believe i'm using a perl loop :P $i_best = $i_best->flat->sever; my @i_best = map { $opt{NTRY} * $_ + $i_best->slice("($_)") } 0 .. $i_best->nelem - 1; my @shapes; for ($centroid, $clus_this, $R2) { my @dims = $_->dims; pop @dims; push @shapes, \@dims; } $clus_this = $clus_this->mv(-1,2)->clump(2..$clus_this->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[1] } )->sever, return ( centroid => $centroid->mv(-1,2)->clump(2..$centroid->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever, cluster => $clus_this, n => $clus_this->sumover, R2 => $R2->mv(-1,0)->clump(0..$R2->ndims-1)->dice_axis(0,\@i_best)->sever->reshape( @{ $shapes[2] } )->sever, $ss_ms => $ss_cv->mv(-1,2)->clump(2..$ss_cv->ndims-1)->dice_axis(2,\@i_best)->sever->reshape( @{ $shapes[0] } )->sever, ); } =head1 METHODS =head2 iv_cluster =for ref Turns an independent variable into a cluster pdl. Returns cluster pdl and level-to-pdl_index mapping in list context and cluster pdl only in scalar context. This is the method used for mean and var in anova. 
The difference between iv_cluster and dummy_code is that iv_cluster returns pdl dim [obs x level] whereas dummy_code returns pdl dim [obs x (level - 1)]. =for usage Usage: pdl> @bake = qw( y y y n n n ) # accepts @ ref or 1d pdl pdl> p $bake = iv_cluster( \@bake ) [ [1 1 1 0 0 0] [0 0 0 1 1 1] ] pdl> p $rating = sequence 6 [0 1 2 3 4 5] pdl> p $rating->centroid( $bake ) # mean for each iv level [ [1 4] ] # ss [ [2 2] ] =cut *iv_cluster = \&PDL::iv_cluster; sub PDL::iv_cluster { my ($var_ref) = @_; my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref ); my $var_a = yvals( short, $var->nelem, $var->max->sclr + 1 ) == $var; $var_a = $var_a->setbadif( $var->isbad ) if $var->badflag; wantarray ? ($var_a, $map_ref) : $var_a; } =head2 pca_cluster Assign variables to components ie clusters based on pca loadings or scores. One way to seed kmeans (see Ding & He, 2004, and Su & Dy, 2004 for other ways of using pca with kmeans). Variables are assigned to their most associated component. Note that some components may not have any variable that is most associated with them, so the returned number of clusters may be smaller than NCOMP. Default options (case insensitive): V => 1, ABS => 1, # high pos and neg loadings on a comp in same cluster NCOMP => undef, # max number of components to consider. determined by # scree plot black magic if not specified PLOT => 0, # pca scree plot with cutoff at NCOMP WIN => undef, # pass pgswin object for more plotting control Usage: # say we need to cluster a group of documents # $data is pdl dim [word x doc] ($data, $idd, $idw) = get_data 'doc_word_info.txt'; pdl> %p = $data->pca; # $cluster is pdl mask dim [doc x ncomp] pdl> $cluster = $p{loading}->pca_cluster; # pca clusters var while kmeans clusters obs. 
hence transpose pdl> ($m, $ss) = $data->transpose->centroid( $cluster ); pdl> %k = $data->transpose->kmeans( { cntrd=>$m } ); # take a look at cluster 0 doc ids pdl> p join("\n", @$idd[ list which $k{cluster}->( ,0) ]); =cut *pca_cluster = \&PDL::pca_cluster; sub PDL::pca_cluster { my ($self, $opt) = @_; my %opt = ( V => 1, ABS => 1, # high pos and neg loadings on a comp in same cluster NCOMP => undef, # max number of components to consider. determined by # scree plot black magic if not specified PLOT => 0, # pca scree plot with cutoff at NCOMP WIN => undef, # pass pgswin object for more plotting control ); if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; } my $var = sumover($self ** 2) / $self->dim(0); if (!$opt{NCOMP}) { # here's the black magic part my $comps = ($self->dim(1) > 300)? int($self->dim(1) * .1) : pdl($self->dim(1), 30)->min ; $var = $var->slice([0,$comps-1])->sever; $opt{NCOMP} = _scree_ind( $var ); } $opt{PLOT} and do { require PDL::Stats::GLM; $var->plot_screes({NCOMP=>$var->dim(0), CUT=>$opt{NCOMP}, WIN=>$opt{WIN}}); }; my $c = $self->slice(':',[0,$opt{NCOMP}-1])->transpose->abs->maximum_ind; if ($opt{ABS}) { $c = $c->iv_cluster; } else { my @c = map { ($self->slice($_,$c->slice($_)) >= 0)? $c->slice($_)*2 : $c->slice($_)*2 + 1 } ( 0 .. $c->dim(0)-1 ); $c = iv_cluster( \@c ); } $opt{V} and print "cluster membership mask as " . $c->info . "\n"; return $c; } =head1 REFERENCES Ding, C., & He, X. (2004). K-means clustering via principal component analysis. Proceedings of the 21st International Conference on Machine Learning, 69, 29. Su, T., & Dy, J. (2004). A deterministic method for initializing K-means clustering. 16th IEEE International Conference on Tools with Artificial Intelligence, 784-786. Romesburg, H.C. (1984). Cluster Analysis for Researchers. NC: Lulu Press. Wikipedia (retrieved June, 2009). K-means clustering. http://en.wikipedia.org/wiki/K-means_algorithm =head1 AUTHOR Copyright (C) 2009 Maggie J. Xiong All rights reserved. 
There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution. =cut EOD pp_done(); PDL-Stats-0.855/lib/PDL/Demos/0000755000175000017500000000000014762214364015441 5ustar osboxesosboxesPDL-Stats-0.855/lib/PDL/Demos/Stats.pm0000644000175000017500000000670414757260204017102 0ustar osboxesosboxespackage PDL::Demos::Stats; use PDL::Graphics::Simple; sub info {('stats', 'Statistics, linear modelling (Req.: PDL::Graphics::Simple)')} sub init {' use PDL::Graphics::Simple; '} my @demo = ( [act => q| # This demo illustrates the PDL::Stats module, # which lets you analyse statistical data in a number of ways. use PDL::Stats; $w = pgswin(); # PDL::Graphics::Simple window srandom(5); # for reproducibility |], [act => q| # First, PDL::Stats::TS - let's show three sets of random data, against # the de-seasonalised version $data = random(12, 3); $data->plot_dseason( 12, { win=>$w } ); |], [act => q| # Now let's show the seasonal means of that data ($m, $ms) = $data->season_m( 6, { plot=>1, win=>$w } ); print "m=$m\nms=$ms"; |], [act => q| # Now, auto-correlation of a random sound-sample. # See https://pdl.perl.org/advent/blog/2024/12/15/pitch-detection/ for more! random(100)->plot_acf( 50, { win=>$w } ); |], [act => q| # PDL::Stats::Kmeans clusters data points into "k" (a supplied number) groups $data = grandom(200, 2); # two rows = two dimensions %k = $data->kmeans; # use default of 3 clusters print "$_\t@{[$k{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %k; $w->plot( (map +(with=>'points', style=>$_+1, ke=>"Cluster ".($_+1), $data->dice_axis(0,which($k{cluster}->slice(",$_")))->dog), 0 .. $k{cluster}->dim(1)-1), (map +(with=>'circles', style=>$_+1, ke=>"Centroid ".($_+1), $k{centroid}->slice($_)->dog, 0.1), 0 .. 
$k{centroid}->dim(0)-1), {le=>'tr'}, ); |], [act => q| # There's also a principal component analysis (PCA) clustering function $data = qsort random 10, 5; # 10 obs on 5 variables %r = $data->pca( { plot=>1, win=>$w } ); # Here we can see that very nearly all the variance is in the first component. |], [act => q| # From that PCA we can plot the original vs PCA-transformed scores # along the first two components $data->plot_scores( $r{eigenvector}, {win=>$w} ); |], [act => q{ # Suppose this is a person's ratings for top 10 box office movies # ascending sorted by box office $y = pdl '[1 1 2 2 2 2 4 4 5 5]'; $x = cat sequence(10), sequence(10)**2; # IV with linear and quadratic component # We do an ordinary least squares (OLS), or multiple linear regression, # to get the underlying linear model. Here we also plot how far the real # data was from our calculated model. %m = $y->ols( $x, { plot=>1, win=>$w } ); print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m; }], [act => q{ $y = pdl '[1 1 2 2 3 3 3 3 4 5 5 5]'; # suppose this is ratings for 12 apples $a = pdl '[1 2 3 1 2 3 1 2 3 1 2 3]'; # IV for types of apple @b = qw( y y y y y y n n n n n n ); # IV for whether we baked the apple # First let's look at the raw data, categorised in each independent variable: $y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)], win=>$w } ); # Looks like there's a visible partition in the "bake" IV }], [act => q{ # Let's try the analysis of variance (ANOVA) in PDL::Stats::GLM %m = $y->anova( $a, \@b, { IVNM=>[qw(apple bake)], plot=>0, win=>$w } ); print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m; # The p-value of variance explained by "bake" is ~0.015 - significant # Let's plot the means of the interaction of all IVs $m{'| apple ~ bake | m'}->plot_means($m{'| apple ~ bake | se'}, { IVNM=>[qw(apple bake)], plot=>1, win=>$w }); }], [comment => q| This concludes the demo. Be sure to check the documentation for PDL::Stats, to see further possibilities. 
|], ); sub demo { @demo } sub done {' undef $w; '} 1; PDL-Stats-0.855/MANIFEST0000644000175000017500000000152614762214365014343 0ustar osboxesosboxesChanges lib/PDL/Demos/Stats.pm lib/PDL/Stats.pm lib/PDL/Stats/Basic.pd lib/PDL/Stats/GLM.pd lib/PDL/Stats/Kmeans.pd lib/PDL/Stats/TS.pd Makefile.PL MANIFEST This list of files README.md t/00-report-prereqs.t t/basic.t t/glm.t t/kmeans.t t/ts.t META.yml Module YAML meta-data (added by MakeMaker) META.json Module JSON meta-data (added by MakeMaker) GENERATED/PDL/Stats/Basic.pm mod=PDL::Stats::Basic pd=lib/PDL/Stats/Basic.pd (added by pdlpp_mkgen) GENERATED/PDL/Stats/GLM.pm mod=PDL::Stats::GLM pd=lib/PDL/Stats/GLM.pd (added by pdlpp_mkgen) GENERATED/PDL/Stats/Kmeans.pm mod=PDL::Stats::Kmeans pd=lib/PDL/Stats/Kmeans.pd (added by pdlpp_mkgen) GENERATED/PDL/Stats/TS.pm mod=PDL::Stats::TS pd=lib/PDL/Stats/TS.pd (added by pdlpp_mkgen) PDL-Stats-0.855/Makefile.PL0000644000175000017500000000327614754014550015163 0ustar osboxesosboxesuse strict; use warnings; use ExtUtils::MakeMaker; use PDL::Core::Dev; my $package_name = "PDL::Stats"; (my $repo = $package_name) =~ s#::#-#g; $repo = "PDLPorters/$repo"; WriteMakefile( NAME => $package_name, AUTHOR => 'Maggie J. 
Xiong ', VERSION_FROM => 'lib/PDL/Stats.pm', ABSTRACT_FROM => 'lib/PDL/Stats.pm', MIN_PERL_VERSION => '5.016', # https://github.com/Perl/perl5/issues/11835 LICENSE=> 'perl', PREREQ_PM => { 'PDL' => '2.099', # badflag propagation fixed }, CONFIGURE_REQUIRES => { 'PDL' => '2.099', }, BUILD_REQUIRES => { 'PDL' => '2.099', }, TEST_REQUIRES => { 'Test::More' => '0.88', # done_testing 'Test::PDL' => '0.21', }, clean => { FILES => ['PDL-Stats-*'] }, META_MERGE => { "meta-spec" => { version => 2 }, prereqs => { develop => { requires => { 'CPAN::Changes' => 0, }, }, runtime => { recommends => { 'PDL::Graphics::Simple' => 0, 'PDL::GSL' => 0, }, }, }, resources => { bugtracker => {web=>"https://github.com/$repo/issues"}, repository => { type => 'git', url => "git://github.com/$repo.git", web => "https://github.com/$repo", }, x_IRC => 'irc://irc.perl.org/#pdl', }, }, ); { my @pd_srcs; package MY; # so that "SUPER" works right sub init_PM { my ($self) = @_; $self->SUPER::init_PM; @pd_srcs = ::pdlpp_eumm_update_deep($self); } sub postamble { ::pdlpp_postamble(@pd_srcs) } }