pspp-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Covariance Matrix


From: John Darrington
Subject: Re: Covariance Matrix
Date: Tue, 6 Oct 2009 19:43:06 +0000
User-agent: Mutt/1.5.18 (2008-05-17)

Thanks for doing this.  I think I see what this is doing.  I have a few 
comments:

1.  It provokes some compiler warnings (which can be trivially fixed).

2.  I know we've discussed this before, but I'm still not comfortable
    with the implicit assumption that categorical variables are always
    non-numeric variables.  Although I accept that using numeric 
    variables as categorical ones is perhaps inadvisable, it is a legacy
    that we are stuck with.  Both the T-TEST and ONEWAY commands permit
    the categorical variable to be numeric (in fact, I think ONEWAY insists
    upon it).   So I think we have to support numerical categorical 
    variables.

3.  Perhaps related to 2.  Our current practice of storing the categorical
    state in the variables' aux parameter is problematic, and the get_dim
    function reminds us of this.  The loop in get_dim should not be necessary,
    since the total number of categories has already been calculated.  I suggest
    that we change the interface and implementation of src/data/category.[ch] 
    such that it stores its state in a separate object.  Although it means a 
    little more work for the caller, I think the benefits will be worthwhile.


Anyway, I suggest that we create a new branch, check Jason's patch into it,
and work on a categorical covariance implementation within that branch.


J'
    

On Tue, Oct 06, 2009 at 11:37:40AM -0400, Jason Stover wrote:
     On Sun, Oct 04, 2009 at 03:21:51PM +0000, John Darrington wrote:
     > It's currently implemented as a single pass algorithm, but it will
     > be straightforward to change that.  There are no categorical variables, 
     > or interactions, at present.
     > 
     ...
     > I'd be interested in any comments and suggestions on how to proceed
     > with generalising the implementation to accept categorical variables.
     
     I have a patch below to start this. Right now, it just allocates space.
     It does not properly retrieve entries with categorical variables, but
     it is a start:
     
     diff --git a/src/language/stats/correlations.c 
b/src/language/stats/correlations.c
     index e397dae..65679f3 100644
     --- a/src/language/stats/correlations.c
     +++ b/src/language/stats/correlations.c
     @@ -324,7 +324,7 @@ run_corr (struct casereader *r, const struct corr_opts 
*opts, const struct corr
        const gsl_matrix *var_matrix,  *samples_matrix, *mean_matrix;
        const gsl_matrix *cov_matrix;
        gsl_matrix *corr_matrix;
     -  struct covariance *cov = covariance_create (corr->n_vars_total, 
corr->vars,
     +  struct covariance *cov = covariance_create_pass1 (corr->n_vars_total, 
corr->vars,
                                              opts->wv, opts->exclude);
      
        for ( ; (c = casereader_read (r) ); case_unref (c))
     diff --git a/src/math/covariance.c b/src/math/covariance.c
     index ba0de0b..350cab4 100644
     --- a/src/math/covariance.c
     +++ b/src/math/covariance.c
     @@ -31,6 +31,7 @@ struct covariance
      {
        /* The variables for which the covariance matrix is to be calculated */
        size_t n_vars;
     +  size_t dim; /* This value equals n_vars if all variables are 
continuous. */
        const struct variable **vars;
        
        /* The weight variable (or NULL if none) */
     @@ -64,11 +65,29 @@ covariance_moments (const struct covariance *cov, int 
m)
      }
      
      
     +static void
     +covariance_create_part2 (struct covariance *cov, enum mv_class exclude)
     +{
     +  size_t i;
     +
     +  cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
     +
     +  for (i = 0; i < n_MOMENTS; ++i)
     +    cov->moments[i] = gsl_matrix_calloc (cov->dim, cov->dim);
      
     -/* Create a covariance struct */
     +  cov->exclude = exclude;
     +
     +  cov->n_cm = (cov->dim * (cov->dim - 1)  ) / 2;
     +
     +  cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
     +}
     +  
     +/* Create a covariance struct to be computed in one data pass.
     +   No categorical variables are allowed. 
     +*/
      struct covariance *
     -covariance_create (size_t n_vars, const struct variable **vars,
     -             const struct variable *weight, enum mv_class exclude)
     +covariance_create_pass1 (size_t n_vars, const struct variable **vars,
     +                   const struct variable *weight, enum mv_class exclude)
      {
        size_t i;
        struct covariance *cov = xmalloc (sizeof *cov);
     @@ -76,20 +95,61 @@ covariance_create (size_t n_vars, const struct 
variable **vars,
      
        cov->wv = weight;
        cov->n_vars = n_vars;
     +  cov->dim = n_vars; /* Only numeric variables are allowed in a single 
data pass,
     +                  so these values are equal.
     +                */
      
        for (i = 0; i < n_vars; ++i)
     -    cov->vars[i] = vars[i];
     +    {
     +      assert (var_is_numeric (vars[i]));
     +      cov->vars[i] = vars[i];
     +    }
      
     -  cov->moments = xmalloc (sizeof *cov->moments * n_MOMENTS);
     -  
     -  for (i = 0; i < n_MOMENTS; ++i)
     -    cov->moments[i] = gsl_matrix_calloc (n_vars, n_vars);
     +  covariance_create_part2 (cov, exclude);
      
     -  cov->exclude = exclude;
     +  return cov;
     +}
      
     -  cov->n_cm = (n_vars * (n_vars - 1)  ) / 2;
     +static size_t
     +get_dim (size_t n_vars, struct variable **vars)
     +{
     +  size_t i;
     +  size_t dim = 0;
      
     -  cov->cm = xcalloc (sizeof *cov->cm, cov->n_cm);
     +  for (i = 0; i < n_vars; i++)
     +    {
     +      if (var_is_numeric (vars[i]))
     +  {
     +    dim++;
     +  }
     +      else
     +  {
     +    dim += cat_get_n_categories (vars[i]);
     +  }
     +    }
     +  return dim;
     +}
     +/* Create a covariance struct with categorical variables.
     +   Call this function after the first data pass.
     +*/
     +struct covariance *
     +covariance_create_pass2 (size_t n_vars, const struct variable **vars,
     +                   const struct variable *weight, enum mv_class exclude)
     +{
     +  size_t i;
     +  struct covariance *cov = xmalloc (sizeof *cov);
     +  cov->vars = xmalloc (sizeof *cov->vars * n_vars);
     +
     +  cov->wv = weight;
     +  cov->n_vars = n_vars;
     +
     +  for (i = 0; i < n_vars; ++i)
     +    cov->vars[i] = vars[i];
     +
     +  cov->dim = get_dim (n_vars, vars);
     +
     +  
     +  covariance_create_part2 (cov, exclude);
      
        return cov;
      }
     diff --git a/src/math/covariance.h b/src/math/covariance.h
     index 8b8de88..7a13cd2 100644
     --- a/src/math/covariance.h
     +++ b/src/math/covariance.h
     @@ -27,8 +27,14 @@ struct covariance;
      struct variable;
      struct ccase ;
      
     -struct covariance * covariance_create (size_t n_vars, const struct 
variable **vars, 
     -                                 const struct variable *wv, enum mv_class 
excl);
     +struct covariance * covariance_create_pass1 (size_t n_vars, 
     +                                       const struct variable **vars,
     +                                       const struct variable *wv, 
     +                                       enum mv_class excl);
     +struct covariance * covariance_create_pass2 (size_t n_vars, 
     +                                       const struct variable **vars,
     +                                       const struct variable *wv, 
     +                                       enum mv_class excl);
      
      void covariance_accumulate (struct covariance *, const struct ccase *);
      
     
     
     _______________________________________________
     pspp-dev mailing list
     address@hidden
     http://lists.gnu.org/mailman/listinfo/pspp-dev

-- 
PGP Public key ID: 1024D/2DE827B3 
fingerprint = 8797 A26D 0854 2EAB 0285  A290 8A67 719C 2DE8 27B3
See http://pgp.mit.edu or any PGP keyserver for public key.


Attachment: signature.asc
Description: Digital signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]