Re: [patch 16/19] Implement FIRST and LAST on MATCH FILES.

pspp-dev
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [patch 16/19] Implement FIRST and LAST on MATCH FILES.

From:	John Darrington
Subject:	Re: [patch 16/19] Implement FIRST and LAST on MATCH FILES.
Date:	Thu, 7 Jun 2007 11:02:49 +0800
User-agent:	Mutt/1.5.13 (2006-08-11)
I can't see any problems here.

On Tue, Jun 05, 2007 at 11:27:43PM -0700, address@hidden wrote:
     Largely rewrites MATCH FILES to implement FIRST and LAST.  Also adds
     tests and documentation.
     
     Index: merge/src/language/data-io/get.c
     ===================================================================
     --- merge.orig/src/language/data-io/get.c  2007-06-05 17:31:50.000000000 
-0700
     +++ merge/src/language/data-io/get.c       2007-06-05 17:54:11.000000000 
-0700
     @@ -646,89 +646,105 @@
      /* MATCH FILES. */
      
      /* File types. */
     -enum
     +enum mtf_type
        {
          MTF_FILE,                     /* Specified on FILE= subcommand. */
          MTF_TABLE                     /* Specified on TABLE= subcommand. */
        };
      
     -/* One of the files on MATCH FILES. */
     +/* One of the FILEs or TABLEs on MATCH FILES. */
      struct mtf_file
        {
     -    struct mtf_file *next, *prev; /* Next, previous in the list of files. 
*/
     -    struct mtf_file *next_min;    /* Next in the chain of minimums. */
     -    
     -    int type;                     /* One of MTF_*. */
     -    const struct variable **by;   /* List of BY variables for this file. 
*/
     -    struct file_handle *handle; /* File handle. */
     -    struct casereader *reader;  /* File reader. */
     -    struct dictionary *dict;      /* Dictionary from system file. */
     -    bool active_file;           /* Active file? */
     +    struct ll ll;               /* In list of all files and tables. */
     +
     +    enum mtf_type type;
     +    int sequence;
     +
     +    struct variable **by;       /* List of BY variables for this file. */
     +    struct mtf_variable *vars;  /* Variables to copy to output. */
     +    size_t var_cnt;             /* Number of other variables. */
     +
     +    struct file_handle *handle; /* Input file handle. */
     +    struct dictionary *dict;      /* Input file dictionary. */
     +    struct casereader *reader;  /* Input reader. */
     +    struct ccase input;         /* Input record (null at end of file). */
      
          /* IN subcommand. */
          char *in_name;              /* Variable name. */
          struct variable *in_var;    /* Variable (in master dictionary). */
     +  };
      
     -    struct ccase input;         /* Input record. */
     +struct mtf_variable 
     +  {
     +    struct variable *in_var;
     +    struct variable *out_var;
        };
      
      /* MATCH FILES procedure. */
      struct mtf_proc 
        {
     -    struct mtf_file *head;      /* First file mentioned on FILE or TABLE. 
*/
     -    struct mtf_file *tail;      /* Last file mentioned on FILE or TABLE. 
*/
     +    struct ll_list files;       /* List of "struct mtf_file"s. */
     +    int nonempty_files;         /* FILEs that are not at end-of-file. */
      
          bool ok;                    /* False if I/O error occurs. */
      
     -    size_t by_cnt;              /* Number of variables on BY subcommand. 
*/
     -
     -    /* Names of FIRST, LAST variables. */
     -    char first[LONG_NAME_LEN + 1], last[LONG_NAME_LEN + 1];
     -    
          struct dictionary *dict;    /* Dictionary of output file. */
          struct casewriter *output;  /* MATCH FILES output. */
     -    struct ccase mtf_case;      /* Case used for output. */
      
     -    unsigned seq_num;           /* Have we initialized this variable? */
     -    unsigned *seq_nums;         /* Sequence numbers for each var in dict. 
*/
     +    size_t by_cnt;              /* Number of variables on BY subcommand. 
*/
     +
     +    /* FIRST, LAST.
     +       Only if "first" or "last" is nonnull are the remaining
     +       members used. */
     +    struct variable *first;     /* Variable specified on FIRST (if any). 
*/
     +    struct variable *last;      /* Variable specified on LAST (if any). */
     +    struct ccase buffered_case; /* Case ready for output except that we 
don't
     +                                   know the value for the LAST variable 
yet. */
     +    struct ccase prev_BY_case;  /* Case with values of last set of BY 
vars. */
     +    struct variable **prev_BY;  /* Last set of BY variables. */
        };
      
     -static bool mtf_free (struct mtf_proc *);
     -static bool mtf_close_file (struct mtf_file *);
     +static void mtf_free (struct mtf_proc *);
     +
      static bool mtf_close_all_files (struct mtf_proc *);
     -static int mtf_merge_dictionary (struct dictionary *const, struct 
mtf_file *);
     -static bool mtf_read_records (struct mtf_proc *);
     -static bool mtf_delete_file_in_place (struct mtf_proc *, struct mtf_file 
**);
     +static bool mtf_merge_dictionary (struct dictionary *const, struct 
mtf_file *);
     +static bool mtf_read_record (struct mtf_proc *mtf, struct mtf_file *);
      
     -static bool mtf_processing (struct mtf_proc *);
     +static void mtf_process_case (struct mtf_proc *);
      
     +static bool create_flag_var (const char *subcommand_name, const char 
*var_name,
     +                             struct dictionary *, struct variable **);
      static char *var_type_description (struct variable *);
      
     -static void set_master (struct variable *, struct variable *master);
     -static struct variable *get_master (struct variable *);
     -
      /* Parse and execute the MATCH FILES command. */
      int
      cmd_match_files (struct lexer *lexer, struct dataset *ds)
      {
        struct mtf_proc mtf;
     -  struct mtf_file *first_table = NULL;
     -  struct mtf_file *iter;
     +  struct ll *first_table;
     +  struct mtf_file *file, *next;
        
     -  bool used_active_file = false;
     -  bool saw_table = false;
        bool saw_in = false;
     -  bool open_active_file = false;
     +  struct casereader *active_file = NULL;
      
     -  mtf.head = mtf.tail = NULL;
     -  mtf.by_cnt = 0;
     -  mtf.first[0] = '\0';
     -  mtf.last[0] = '\0';
     +  char first_name[LONG_NAME_LEN + 1] = "";
     +  char last_name[LONG_NAME_LEN + 1] = "";
     +
     +  struct taint *taint = NULL;
     +
     +  size_t i;
     +
     +  ll_init (&mtf.files);
     +  mtf.nonempty_files = 0;
     +  first_table = ll_null (&mtf.files);
        mtf.dict = dict_create ();
        mtf.output = NULL;
     -  case_nullify (&mtf.mtf_case);
     -  mtf.seq_num = 0;
     -  mtf.seq_nums = NULL;
     +  mtf.by_cnt = 0;
     +  mtf.first = mtf.last = NULL;
     +  case_nullify (&mtf.buffered_case);
     +  case_nullify (&mtf.prev_BY_case);
     +  mtf.prev_BY = NULL;
     +
        dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
      
        lex_match (lexer, '/');
     @@ -737,65 +753,35 @@
                   || lex_id_match (ss_cstr ("TABLE"), ss_cstr (lex_tokid 
(lexer)))))
          {
            struct mtf_file *file = xmalloc (sizeof *file);
     -
     -      if (lex_match_id (lexer, "FILE"))
     -        file->type = MTF_FILE;
     -      else if (lex_match_id (lexer, "TABLE"))
     -        {
     -          file->type = MTF_TABLE;
     -          saw_table = true;
     -        }
     -      else
     -        NOT_REACHED ();
     -      lex_match (lexer, '=');
     -
            file->by = NULL;
            file->handle = NULL;
            file->reader = NULL;
            file->dict = NULL;
            file->in_name = NULL;
            file->in_var = NULL;
     -      file->active_file = false;
     +      file->var_cnt = 0;
     +      file->vars = NULL;
            case_nullify (&file->input);
      
     -      /* FILEs go first, then TABLEs. */
     -      if (file->type == MTF_TABLE || first_table == NULL)
     +      if (lex_match_id (lexer, "FILE")) 
              {
     -          file->next = NULL;
     -          file->prev = mtf.tail;
     -          if (mtf.tail)
     -            mtf.tail->next = file;
     -          mtf.tail = file;
     -          if (mtf.head == NULL)
     -            mtf.head = file;
     -          if (file->type == MTF_TABLE && first_table == NULL)
     -            first_table = file;
     +          file->type = MTF_FILE;
     +          ll_insert (first_table, &file->ll);
     +          mtf.nonempty_files++;
              }
     -      else 
     +      else if (lex_match_id (lexer, "TABLE")) 
              {
     -          assert (file->type == MTF_FILE);
     -          file->next = first_table;
     -          file->prev = first_table->prev;
     -          if (first_table->prev)
     -            first_table->prev->next = file;
     -          else
     -            mtf.head = file;
     -          first_table->prev = file;
     +          file->type = MTF_TABLE;
     +          ll_push_tail (&mtf.files, &file->ll);
     +          if (first_table == ll_null (&mtf.files))
     +            first_table = &file->ll; 
              }
     +      else
     +        NOT_REACHED ();
     +      lex_match (lexer, '=');
      
            if (lex_match (lexer, '*'))
              {
     -          file->handle = NULL;
     -          file->reader = NULL;
     -              
     -          if (used_active_file)
     -            {
     -              msg (SE, _("The active file may not be specified more "
     -                         "than once."));
     -              goto error;
     -            }
     -          used_active_file = true;
     -
                if (!proc_has_active_file (ds))
                  {
                    msg (SE, _("Cannot specify the active file since no active "
     @@ -809,8 +795,7 @@
                         "the active file is an input source.  "
                         "Temporary transformations will be made permanent."));
      
     -          file->dict = dataset_dict (ds);
     -          file->active_file = true;
     +          file->dict = dict_clone (dataset_dict (ds));
              }
            else
              {
     @@ -856,67 +841,74 @@
          {
            if (lex_match (lexer, T_BY))
        {
     -          const struct variable **by;
     +          struct mtf_file *file;
     +          struct variable **by;
     +          bool ok;
                
          if (mtf.by_cnt)
            {
     -        msg (SE, _("BY may appear at most once."));
     +              lex_sbc_only_once ("BY");
              goto error;
            }
              
          lex_match (lexer, '=');
     -    if (!parse_variables_const (lexer, mtf.dict, &by, &mtf.by_cnt,
     +    if (!parse_variables (lexer, mtf.dict, &by, &mtf.by_cnt,
                                PV_NO_DUPLICATE | PV_NO_SCRATCH))
            goto error;
      
     -          for (iter = mtf.head; iter != NULL; iter = iter->next)
     +          ok = true;
     +          ll_for_each (file, struct mtf_file, ll, &mtf.files)
                  {
                    size_t i;
          
     -              iter->by = xnmalloc (mtf.by_cnt, sizeof *iter->by);
     -
     +              file->by = xnmalloc (mtf.by_cnt, sizeof *file->by);
                    for (i = 0; i < mtf.by_cnt; i++)
                      {
     -                  iter->by[i] = dict_lookup_var (iter->dict,
     -                                                 var_get_name (by[i]));
     -                  if (iter->by[i] == NULL)
     +                  const char *var_name = var_get_name (by[i]);
     +                  file->by[i] = dict_lookup_var (file->dict, var_name);
     +                  if (file->by[i] == NULL)
                          {
     -                      msg (SE, _("File %s lacks BY variable %s."),
     -                           iter->handle ? fh_get_name (iter->handle) : 
"*",
     -                           var_get_name (by[i]));
     -                      free (by);
     -                      goto error;
     +                      if (file->handle != NULL)
     +                        msg (SE, _("File %s lacks BY variable %s."),
     +                             fh_get_name (file->handle), var_name);
     +                      else
     +                        msg (SE, _("Active file lacks BY variable %s."),
     +                             var_name);
     +                      ok = false;
                          }
                      }
                  }
                free (by);
     +
     +          if (!ok)
     +            goto error;
        }
            else if (lex_match_id (lexer, "FIRST")) 
              {
     -          if (mtf.first[0] != '\0')
     +          if (first_name[0] != '\0')
                  {
     -              msg (SE, _("FIRST may appear at most once."));
     +              lex_sbc_only_once ("FIRST");
                    goto error;
                  }
              
          lex_match (lexer, '=');
                if (!lex_force_id (lexer))
                  goto error;
     -          strcpy (mtf.first, lex_tokid (lexer));
     +          strcpy (first_name, lex_tokid (lexer));
                lex_get (lexer);
              }
            else if (lex_match_id (lexer, "LAST")) 
              {
     -          if (mtf.last[0] != '\0')
     +          if (last_name[0] != '\0')
                  {
     -              msg (SE, _("LAST may appear at most once."));
     +              lex_sbc_only_once ("LAST");
                    goto error;
                  }
              
          lex_match (lexer, '=');
                if (!lex_force_id (lexer))
                  goto error;
     -          strcpy (mtf.last, lex_tokid (lexer));
     +          strcpy (last_name, lex_tokid (lexer));
                lex_get (lexer);
              }
            else if (lex_match_id (lexer, "MAP"))
     @@ -948,7 +940,7 @@
      
        if (mtf.by_cnt == 0)
          {
     -      if (saw_table)
     +      if (first_table != ll_null (&mtf.files))
              {
                msg (SE, _("BY is required when TABLE is specified."));
                goto error;
     @@ -962,101 +954,115 @@
      
        /* Set up mapping from each file's variables to master
           variables. */
     -  for (iter = mtf.head; iter != NULL; iter = iter->next)
     +  ll_for_each (file, struct mtf_file, ll, &mtf.files) 
          {
     -      struct dictionary *d = iter->dict;
     -      int i;
     +      size_t in_var_cnt = dict_get_var_cnt (file->dict);
      
     -      for (i = 0; i < dict_get_var_cnt (d); i++)
     +      file->vars = xnmalloc (in_var_cnt, sizeof *file->vars);
     +      file->var_cnt = 0;
     +      for (i = 0; i < in_var_cnt; i++) 
              {
     -          struct variable *v = dict_get_var (d, i);
     -          struct variable *mv = dict_lookup_var (mtf.dict, var_get_name 
(v));
     -          if (mv != NULL)
     -            set_master (v, mv);
     +          struct variable *in_var = dict_get_var (file->dict, i);
     +          struct variable *out_var = dict_lookup_var (mtf.dict,
     +                                                      var_get_name 
(in_var));
     +
     +          if (out_var != NULL) 
     +            {
     +              struct mtf_variable *mv = &file->vars[file->var_cnt++];
     +              mv->in_var = in_var;
     +              mv->out_var = out_var;
     +            } 
              }
          }
      
     -  /* Add IN variables to master dictionary. */
     -  for (iter = mtf.head; iter != NULL; iter = iter->next) 
     -    if (iter->in_name != NULL)
     -      {
     -        struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
     -        iter->in_var = dict_create_var (mtf.dict, iter->in_name, 0);
     -        if (iter->in_var == NULL)
     -          {
     -            msg (SE, _("IN variable name %s duplicates an "
     -                       "existing variable name."),
     -                 var_get_name (iter->in_var));
     -            goto error;
     -          }
     -        var_set_both_formats (iter->in_var, &format);
     -      }
     -    
     -  /* MATCH FILES performs an n-way merge on all its input files.
     -     Abstract algorithm:
     -
     -     1. Read one input record from every input FILE.
     -
     -     2. If no FILEs are left, stop.  Otherwise, proceed to step 3.
     -
     -     3. Find the FILE input record(s) that have minimum BY
     -     values.  Store all the values from these input records into
     -     the output record.
     -
     -     4. For every TABLE, read another record as long as the BY values
     -     on the TABLE's input record are less than the FILEs' BY values.
     -     If an exact match is found, store all the values from the TABLE
     -     input record into the output record.
     -
     -     5. Write the output record.
     -
     -     6. Read another record from each input file FILE and TABLE that
     -     we stored values from above.  If we come to the end of one of the
     -     input files, remove it from the list of input files.
     -
     -     7. Repeat from step 2.
     -
     -     FIXME: For merging large numbers of files (more than 10?) a
     -     better algorithm would use a heap for finding minimum
     -     values. */
     -
     -  if (used_active_file) 
     -    {
     -      proc_discard_output (ds);
     -      for (iter = mtf.head; iter != NULL; iter = iter->next)
     -        if (iter->reader == NULL) 
     -          iter->reader = proc_open (ds);
     -      open_active_file = true;
     -    }
     +  /* Add IN, FIRST, and LAST variables to master dictionary. */
     +  ll_for_each (file, struct mtf_file, ll, &mtf.files)
     +    if (!create_flag_var ("IN", file->in_name, mtf.dict, &file->in_var))
     +      goto error;
     +  if (!create_flag_var ("FIRST", first_name, mtf.dict, &mtf.first)
     +      || !create_flag_var ("LAST", last_name, mtf.dict, &mtf.last))
     +    goto error;
      
        dict_compact_values (mtf.dict);
        mtf.output = autopaging_writer_create (dict_get_next_value_idx 
(mtf.dict));
     -  mtf.seq_nums = xcalloc (dict_get_var_cnt (mtf.dict), sizeof 
*mtf.seq_nums);
     -  case_create (&mtf.mtf_case, dict_get_next_value_idx (mtf.dict));
     +  taint = taint_clone (casewriter_get_taint (mtf.output));
      
     -  if (!mtf_read_records (&mtf)) 
     -    goto error; 
     -  while (mtf.head && mtf.head->type == MTF_FILE)
     -    if (!mtf_processing (&mtf))
     -      goto error; 
     -  if (!mtf_close_all_files (&mtf))
     -    goto error;
     -  if (open_active_file)
     +  ll_for_each (file, struct mtf_file, ll, &mtf.files) 
     +    {
     +      if (file->reader == NULL) 
     +        {
     +          if (active_file == NULL) 
     +            {
     +              proc_discard_output (ds);
     +              file->reader = active_file = proc_open (ds);
     +            }
     +          else
     +            file->reader = casereader_clone (active_file);
     +        }
     +      taint_propagate (casereader_get_taint (file->reader), taint);
     +    }
     +  
     +  ll_for_each_safe (file, next, struct mtf_file, ll, &mtf.files)
     +    mtf_read_record (&mtf, file);
     +  while (mtf.nonempty_files > 0)
     +    mtf_process_case (&mtf);
     +  if ((mtf.first != NULL || mtf.last != NULL) && mtf.prev_BY != NULL) 
     +    {
     +      if (mtf.last != NULL)
     +        case_data_rw (&mtf.buffered_case, mtf.last)->f = 1.0;
     +      casewriter_write (mtf.output, &mtf.buffered_case);
     +      case_nullify (&mtf.buffered_case);
     +    }
     +  mtf_close_all_files (&mtf);
     +  if (active_file != NULL)
          proc_commit (ds);
      
        proc_set_active_file (ds, casewriter_make_reader (mtf.output), 
mtf.dict);
        mtf.dict = NULL;
        mtf.output = NULL;
      
     -  return mtf_free (&mtf) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
     +  mtf_free (&mtf);
     +
     +  return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
      
       error:
     -  if (open_active_file)
     +  if (active_file != NULL)
          proc_commit (ds);
        mtf_free (&mtf);
     +  taint_destroy (taint);
        return CMD_CASCADING_FAILURE;
      }
      
     +/* If VAR_NAME is a nonnull pointer to a non-empty string,
     +   attempts to create a variable named VAR_NAME, with format
     +   F1.0, in DICT, and stores a pointer to the variable in *VAR.
     +   Returns true if successful, false if the variable name is a
     +   duplicate (in which case a message saying that the variable
     +   specified on the given SUBCOMMAND is a duplicate is emitted).
     +   Also returns true, without doing anything, if VAR_NAME is null
     +   or empty. */
     +static bool
     +create_flag_var (const char *subcommand, const char *var_name,
     +                 struct dictionary *dict, struct variable **var)
     +{
     +  if (var_name != NULL && var_name[0] != '\0') 
     +    {
     +      struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
     +      *var = dict_create_var (dict, var_name, 0);
     +      if (*var == NULL)
     +        {
     +          msg (SE, _("Variable name %s specified on %s subcommand "
     +                     "duplicates an existing variable name."),
     +               subcommand, var_name);
     +          return false;
     +        }
     +      var_set_both_formats (*var, &format);
     +    }
     +  else 
     +    *var = NULL;
     +  return true;
     +}
     +
      /* Return a string in an allocated buffer describing V's variable
         type and width. */
      static char *
     @@ -1068,118 +1074,58 @@
          return xasprintf ("string with width %d", var_get_width (v));
      }
      
     -/* Closes FILE and frees its associated data.
     -   Returns true if successful, false if an I/O error
     -   occurred on FILE. */
     -static bool
     -mtf_close_file (struct mtf_file *file)
     -{
     -  bool ok = casereader_destroy (file->reader);
     -  free (file->by);
     -  if (!file->active_file)
     -    dict_destroy (file->dict);
     -  free (file->in_name);
     -  case_destroy (&file->input);
     -  free (file);
     -  return ok;
     -}
     -
     +/* Closes all the files in MTF and frees their associated data.
     +   Returns true if successful, false if an I/O error occurred on
     +   any of the files. */
      static bool
      mtf_close_all_files (struct mtf_proc *mtf) 
      {
     -  struct mtf_file *iter, *next;
     +  struct mtf_file *file;
        bool ok = true;
      
     -  for (iter = mtf->head; iter; iter = next)
     +  ll_for_each_preremove (file, struct mtf_file, ll, &mtf->files)
          {
     -      next = iter->next;
     -      assert (iter->dict != mtf->dict);
     -      if (!mtf_close_file (iter))
     -        ok = false;
     +      casereader_destroy (file->reader);
     +      free (file->by);
     +      dict_destroy (file->dict);
     +      free (file->in_name);
     +      case_destroy (&file->input);
     +      free (file->vars);
     +      free (file);
          }
     -  mtf->head = NULL;
     +
        return ok;
      }
      
     -/* Free all the data for the MATCH FILES procedure.
     -   Returns true if successful, false if an I/O error
     -   occurred. */
     -static bool
     +/* Frees all the data for the MATCH FILES procedure. */
     +static void
      mtf_free (struct mtf_proc *mtf)
      {
     -  bool ok;
     -
     -  ok = mtf_close_all_files (mtf);
     -
     -  casewriter_destroy (mtf->output);
     +  mtf_close_all_files (mtf);
        dict_destroy (mtf->dict);
     -  case_destroy (&mtf->mtf_case);
     -  free (mtf->seq_nums);
     -
     -  return ok;
     -}
     -
     -/* Remove *FILE from the mtf_file chain.  Make *FILE point to the next
     -   file in the chain, or to NULL if was the last in the chain.
     -   Returns true if successful, false if an I/O error occurred. */
     -static bool
     -mtf_delete_file_in_place (struct mtf_proc *mtf, struct mtf_file **file)
     -{
     -  struct mtf_file *f = *file;
     -  int i;
     -
     -  if (f->prev)
     -    f->prev->next = f->next;
     -  if (f->next)
     -    f->next->prev = f->prev;
     -  if (f == mtf->head)
     -    mtf->head = f->next;
     -  if (f == mtf->tail)
     -    mtf->tail = f->prev;
     -  *file = f->next;
     -
     -  if (f->in_var != NULL)
     -    case_data_rw (&mtf->mtf_case, f->in_var)->f = 0.;
     -  for (i = 0; i < dict_get_var_cnt (f->dict); i++)
     -    {
     -      struct variable *v = dict_get_var (f->dict, i);
     -      struct variable *mv = get_master (v);
     -      if (mv != NULL) 
     -        {
     -          union value *out = case_data_rw (&mtf->mtf_case, mv);
     -    
     -          if (var_is_numeric (v))
     -            out->f = SYSMIS;
     -          else
     -            memset (out->s, ' ', var_get_width (v));
     -        } 
     -    }
     -
     -  return mtf_close_file (f);
     +  casewriter_destroy (mtf->output);
     +  case_destroy (&mtf->buffered_case);
     +  case_destroy (&mtf->prev_BY_case);
      }
      
     -/* Read a record from every input file.
     -   Returns true if successful, false if an I/O error occurred. */
     +/* Reads the next record into FILE, if possible, and update MTF's
     +   nonempty_files count if not. */
      static bool
     -mtf_read_records (struct mtf_proc *mtf)
     +mtf_read_record (struct mtf_proc *mtf, struct mtf_file *file)
      {
     -  struct mtf_file *iter, *next;
     -  bool ok = true;
     -
     -  for (iter = mtf->head; ok && iter != NULL; iter = next)
     +  case_destroy (&file->input);
     +  if (!casereader_read (file->reader, &file->input))
          {
     -      next = iter->next;
     -      if (!casereader_read (iter->reader, &iter->input))
     -        {
     -          if (!mtf_delete_file_in_place (mtf, &iter))
     -            ok = false; 
     -        }
     +      mtf->nonempty_files--;
     +      return false;
          }
     -  return ok;
     +  else
     +    return true;
      }
      
     -/* Compare the BY variables for files A and B; return -1 if A < B, 0
     -   if A == B, 1 if A > B. */
     +/* Compare the BY variables for files A and B; return -1 if A <
     +   B, 0 if A == B, 1 if A > B.  (If there are no BY variables,
     +   then all records are equal.) */
      static inline int
      mtf_compare_BY_values (struct mtf_proc *mtf,
                             struct mtf_file *a, struct mtf_file *b)
     @@ -1187,163 +1133,133 @@
        return case_compare_2dict (&a->input, &b->input, a->by, b->by, 
mtf->by_cnt);
      }
      
     -/* Perform one iteration of steps 3...7 above.
     -   Returns true if successful, false if an I/O error occurred. */
     -static bool
     -mtf_processing (struct mtf_proc *mtf)
     +/* Processes input files and write one case to the output file. */
     +static void
     +mtf_process_case (struct mtf_proc *mtf)
      {
     -  struct mtf_file *min_head, *min_tail; /* Files with minimum BY values. 
*/
     -  struct mtf_file *max_head, *max_tail; /* Files with non-minimum BYs. */
     -  struct mtf_file *iter, *next;
     -  struct ccase out_case;
     -
     -  /* 3. Find the FILE input record(s) that have minimum BY
     -     values.  Store all the values from these input records into
     -     the output record. */
     -  min_head = min_tail = mtf->head;
     -  max_head = max_tail = NULL;
     -  for (iter = mtf->head->next; iter && iter->type == MTF_FILE;
     -       iter = iter->next) 
     -    {
     -      int cmp = mtf_compare_BY_values (mtf, min_head, iter);
     -      if (cmp < 0) 
     -        {
     -          if (max_head)
     -            max_tail = max_tail->next_min = iter;
     -          else
     -            max_head = max_tail = iter;
     -        }
     -      else if (cmp == 0) 
     -        min_tail = min_tail->next_min = iter;
     -      else /* cmp > 0 */
     -        {
     -          if (max_head)
     -            {
     -              max_tail->next_min = min_head;
     -              max_tail = min_tail;
     -            }
     -          else
     -            {
     -              max_head = min_head;
     -              max_tail = min_tail;
     -            }
     -          min_head = min_tail = iter;
     -        }
     +  struct ccase c;
     +  struct mtf_file *min;
     +  struct mtf_file *file;
     +  int min_sequence;
     +  size_t i;
     +
     +  /* Find the set of one or more FILEs whose BY values are
     +     minimal, as well as the set of zero or more TABLEs whose BY
     +     values equal those of the minimum FILEs.
     +
     +     After each iteration of the loop, this invariant holds: the
     +     FILEs with minimum BY values thus far have "sequence"
     +     members equal to min_sequence, and "min" points to one of
     +     the mtf_files whose case has those minimum BY values, and
     +     similarly for TABLEs. */
     +  min_sequence = 0;
     +  min = NULL;
     +  ll_for_each (file, struct mtf_file, ll, &mtf->files)
     +    if (case_is_null (&file->input))
     +      file->sequence = -1;
     +    else if (file->type == MTF_FILE) 
     +      {
     +        int cmp = min != NULL ? mtf_compare_BY_values (mtf, min, file) : 
1;
     +        if (cmp <= 0)
     +          file->sequence = cmp < 0 ? -1 : min_sequence;
     +        else 
     +          {
     +            file->sequence = ++min_sequence;
     +            min = file;
     +          }
     +      }
     +    else
     +      {
     +        int cmp;
     +        assert (min != NULL);
     +        do
     +          {
     +            cmp = mtf_compare_BY_values (mtf, min, file);
     +          }
     +        while (cmp > 0 && mtf_read_record (mtf, file));
     +        file->sequence = cmp == 0 ? min_sequence : -1;
     +      }      
     +
     +  /* Form the output case from the input cases. */
     +  case_create (&c, dict_get_next_value_idx (mtf->dict));
     +  for (i = 0; i < dict_get_var_cnt (mtf->dict); i++) 
     +    {
     +      struct variable *v = dict_get_var (mtf->dict, i);
     +      value_set_missing (case_data_rw (&c, v), var_get_width (v)); 
     +    }
     +  ll_for_each_reverse (file, struct mtf_file, ll, &mtf->files)
     +    {
     +      bool include_file = file->sequence == min_sequence;
     +      if (include_file) 
     +        for (i = 0; i < file->var_cnt; i++)
     +          {
     +            const struct mtf_variable *mv = &file->vars[i];
     +            const union value *in = case_data (&file->input, mv->in_var);
     +            union value *out = case_data_rw (&c, mv->out_var);
     +            value_copy (out, in, var_get_width (mv->in_var)); 
     +          }
     +      if (file->in_var != NULL) 
     +        case_data_rw (&c, file->in_var)->f = include_file;
          }
     -      
     -  /* 4. For every TABLE, read another record as long as the BY
     -     values on the TABLE's input record are less than the FILEs'
     -     BY values.  If an exact match is found, store all the values
     -     from the TABLE input record into the output record. */
     -  for (; iter != NULL; iter = next)
     +  
     +  /* Write the output case. */
     +  if (mtf->first == NULL && mtf->last == NULL) 
          {
     -      assert (iter->type == MTF_TABLE);
     -      
     -      next = iter->next;
     -      for (;;) 
     -        {
     -          int cmp = mtf_compare_BY_values (mtf, min_head, iter);
     -          if (cmp < 0) 
     -            {
     -              if (max_head)
     -                max_tail = max_tail->next_min = iter;
     -              else
     -                max_head = max_tail = iter;
     -            }
     -          else if (cmp == 0)
     -            min_tail = min_tail->next_min = iter;
     -          else /* cmp > 0 */
     -            {
     -              case_destroy (&iter->input);
     -              if (casereader_read (iter->reader, &iter->input))
     -                continue;
     -              if (!mtf_delete_file_in_place (mtf, &iter))
     -                return false;
     -            }
     -          break;
     -        }
     +      /* With no FIRST or LAST variables, it's trivial. */
     +      casewriter_write (mtf->output, &c);
          }
     -
     -  /* Next sequence number. */
     -  mtf->seq_num++;
     -
     -  /* Store data to all the records we are using. */
     -  if (min_tail)
     -    min_tail->next_min = NULL;
     -  for (iter = min_head; iter; iter = iter->next_min)
     +  else 
          {
     -      int i;
     -
     -      for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
     -        {
     -          struct variable *v = dict_get_var (iter->dict, i);
     -          struct variable *mv = get_master (v);
     -          size_t mv_index = mv ? var_get_dict_index (mv) : 0;
     -    
     -          if (mv != NULL && mtf->seq_nums[mv_index] != mtf->seq_num) 
     -            {
     -              union value *out = case_data_rw (&mtf->mtf_case, mv);
     -
     -              mtf->seq_nums[mv_index] = mtf->seq_num;
     -              if (var_is_numeric (v))
     -                out->f = case_num (&iter->input, v);
     -              else
     -                memcpy (out->s, case_str (&iter->input, v), var_get_width 
(v));
     -            } 
     +      /* It's harder with LAST, because we can't know whether
     +         this case is the last in a group until we've prepared
     +         the *next* case also.  Thus, we buffer the previous
     +         output case until the next one is ready.
     +
     +         We also have to save a copy of one of the previous input
     +         cases, so that we can compare the BY variables.  We
     +         can't compare the BY variables between the current
     +         output case and the saved one because the BY variables
     +         might not be in the output (the user is allowed to drop
     +         them). */
     +      bool new_BY;
     +      if (mtf->prev_BY != NULL) 
     +        {
     +          new_BY = case_compare_2dict (&min->input, &mtf->prev_BY_case,
     +                                       min->by, mtf->prev_BY,
     +                                       mtf->by_cnt);
     +          if (mtf->last != NULL)
     +            case_data_rw (&mtf->buffered_case, mtf->last)->f = new_BY;
     +          casewriter_write (mtf->output, &mtf->buffered_case);
              }
     -      if (iter->in_var != NULL)
     -        case_data_rw (&mtf->mtf_case, iter->in_var)->f = 1.;
     -    }
     +      else 
     +        new_BY = true;
      
     -  /* Store missing values to all the records we're not using. */
     -  if (max_tail)
     -    max_tail->next_min = NULL;
     -  for (iter = max_head; iter; iter = iter->next_min)
     -    {
     -      int i;
     +      case_move (&mtf->buffered_case, &c);
     +      if (mtf->first != NULL)
     +        case_data_rw (&mtf->buffered_case, mtf->first)->f = new_BY;
      
     -      for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
     +      if (new_BY) 
              {
     -          struct variable *v = dict_get_var (iter->dict, i);
     -          struct variable *mv = get_master (v);
     -          size_t mv_index = mv ? var_get_dict_index (mv) : 0;
     -
     -          if (mv != NULL && mtf->seq_nums[mv_index] != mtf->seq_num) 
     -            {
     -              union value *out = case_data_rw (&mtf->mtf_case, mv);
     -              mtf->seq_nums[mv_index] = mtf->seq_num;
     -
     -              if (var_is_numeric (v))
     -                out->f = SYSMIS;
     -              else
     -                memset (out->s, ' ', var_get_width (v));
     -            }
     +          mtf->prev_BY = min->by;
     +          case_destroy (&mtf->prev_BY_case);
     +          case_clone (&mtf->prev_BY_case, &min->input); 
              }
     -      if (iter->in_var != NULL)
     -        case_data_rw (&mtf->mtf_case, iter->in_var)->f = 0.;
          }
      
     -  /* 5. Write the output record. */
     -  case_clone (&out_case, &mtf->mtf_case);
     -  casewriter_write (mtf->output, &out_case);
     -
     -  /* 6. Read another record from each input file FILE and TABLE
     -     that we stored values from above.  If we come to the end of
     -     one of the input files, remove it from the list of input
     -     files. */
     -  for (iter = min_head; iter && iter->type == MTF_FILE; iter = next)
     -    {
     -      next = iter->next_min;
     -      case_destroy (&iter->input);
     -      if (!casereader_read (iter->reader, &iter->input))
     -        if (!mtf_delete_file_in_place (mtf, &iter))
     -          return false;
     -    }
     -  return true;
     +  /* Read another record from each input file FILE with minimum
     +     values. */
     +  ll_for_each (file, struct mtf_file, ll, &mtf->files)
     +    if (file->type == MTF_FILE) 
     +      {
     +        if (file->sequence == min_sequence)
     +          mtf_read_record (mtf, file); 
     +      }
     +    else
     +      break;
      }
      
      /* Merge the dictionary for file F into master dictionary M. */
     -static int
     +static bool
      mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
      {
        struct dictionary *d = f->dict;
     @@ -1379,12 +1295,16 @@
              {
                if (var_get_width (mv) != var_get_width (dv)) 
                  {
     +              char *dv_description = var_type_description (dv);
     +              char *mv_description = var_type_description (mv);
                    msg (SE, _("Variable %s in file %s (%s) has different "
                               "type or width from the same variable in "
                               "earlier file (%s)."),
                         var_get_name (dv), fh_get_name (f->handle),
     -                   var_type_description (dv), var_type_description (mv));
     -              return 0;
     +                   dv_description, mv_description);
     +              free (dv_description);
     +              free (mv_description);
     +              return false;
                  }
              
                if (var_get_width (dv) == var_get_width (mv))
     @@ -1398,26 +1318,11 @@
                if (var_get_label (dv) && !var_get_label (mv))
                  var_set_label (mv, var_get_label (dv));
              }
     -      else
     +      else 
              mv = dict_clone_var_assert (m, dv, var_get_name (dv));
          }
      
     -  return 1;
     -}
     -
     -/* Marks V's master variable as MASTER. */
     -static void
     -set_master (struct variable *v, struct variable *master) 
     -{
     -  var_attach_aux (v, master, NULL);
     -}
     -
     -/* Returns the master variable corresponding to V,
     -   as set with set_master(). */
     -static struct variable *
     -get_master (struct variable *v) 
     -{
     -  return var_get_aux (v);
     +  return true;
      }
      
      /* Case map.
     Index: merge/doc/files.texi
     ===================================================================
     --- merge.orig/doc/files.texi      2007-06-05 17:31:50.000000000 -0700
     +++ merge/doc/files.texi   2007-06-05 17:33:35.000000000 -0700
     @@ -195,12 +195,12 @@
      @display
      MATCH FILES
              /@{FILE,address@hidden@{*,'file-name'@}
     -        /DROP=var_list
     -        /KEEP=var_list
              /RENAME=(src_names=target_names)@dots{}
              /IN=var_name
      
     -        /BY var_list
     +        /BY=var_list
     +        /DROP=var_list
     +        /KEEP=var_list
              /FIRST=var_name
              /LAST=var_name
              /MAP
     @@ -208,38 +208,33 @@
      
      @cmd{MATCH FILES} merges one or more system, portable, or scratch files,
      optionally
     -including the active file.  Records with the same values for BY
     -variables are combined into a single record.  Records with different
     +including the active file.  Cases with the same values for BY
     +variables are combined into a single case.  Cases with different
      values are output in order.  Thus, multiple sorted files are
      combined into a single sorted file based on the value of the BY
      variables.  The results of the merge become the new active file.
      
     -The BY subcommand specifies a list of variables that are used to match
     -records from each of the files.  Variables specified must exist
     -in all the files specified on FILE and TABLE.  BY should usually be
     -specified.  If TABLE or IN is used then BY is required.
     -
      Specify FILE with a system, portable, or scratch file as a file name
      string or file handle
      (@pxref{File Handles}), or with an asterisk (@samp{*}) to
      indicate the current active file.  The files specified on FILE are
      merged together based on the BY variables, or combined case-by-case if
     -BY is not specified.  Normally at least two FILE subcommands should be
     -specified.
     +BY is not specified.
      
      Specify TABLE with a file to use it as a @dfn{table
     -lookup file}.  Records in table lookup files are not used up after
     +lookup file}.  Cases in table lookup files are not used up after
      they've been used once.  This means that data in table lookup files can
     -correspond to any number of records in FILE files.  Table lookup files
     +correspond to any number of cases in FILE files.  Table lookup files
      correspond to lookup tables in traditional relational database systems.
     -It is incorrect to have records with duplicate BY values in table lookup
     -files.
     +If a table lookup file contains more than one case with a given set of
     +BY variables, only the first case is used.
      
     -Any number of FILE and TABLE subcommands may be specified.  Each
     -instance of FILE or TABLE can be followed by any sequence of DROP,
     -KEEP, or RENAME subcommands.  These have the same form and meaning as
     -the corresponding subcommands of @cmd{GET} (@pxref{GET}), but apply
     -only to variables in the given file.
     +Any number of FILE and TABLE subcommands may be specified.
     +Ordinarily, at least two FILE subcommands, or one FILE and at least
     +one TABLE, should be specified.  Each instance of FILE or TABLE can be
     +followed by any sequence of RENAME subcommands.  These have the same
     +form and meaning as the corresponding subcommands of @cmd{GET}
     +(@pxref{GET}), but apply only to variables in the given file.
      
      Each FILE or TABLE may optionally be followed by an IN subcommand,
      which creates a numeric variable with the specified name and format
     @@ -247,11 +242,38 @@
      contributed a row to the merged file, 0 otherwise.  The DROP, KEEP,
      and RENAME subcommands do not affect IN variables.
      
     -Variables belonging to files that are not present for the current case
     -are set to the system-missing value for numeric variables or spaces for
     -string variables.
     +When more than one FILE or TABLE contains a variable with a given
     +name, those variables must all have the same type (numeric or string)
     +and, for string variables, the same width.  This rules applies to
     +variable names after renaming with RENAME; thus, RENAME can be used to
     +resolve conflicts.
     +
     +FILE and TABLE must be specified at the beginning of the command, with
     +any RENAME or IN specifications immediately after the corresponding
     +FILE or TABLE.  These subcommands are followed by BY, DROP, KEEP,
     +FIRST, LAST, and MAP.
      
     -FIRST, LAST, and MAP are currently ignored.
     +The BY subcommand specifies a list of variables that are used to match
     +cases from each of the files.  When TABLE or IN is used, BY is
     +required; otherwise, it is optional.  When BY is specified, all the
     +files named on FILE and TABLE subcommands must be sorted in ascending
     +order of the BY variables.  Variables belonging to files that are not
     +present for the current case are set to the system-missing value for
     +numeric variables or spaces for string variables.
     +
     +The DROP and KEEP subcommands allow variables to be dropped from or
     +reordered within the new active file.  These subcommands have the same
     +form and meaning as the corresponding subcommands of @cmd{GET}
     +(@pxref{GET}).  They apply to the new active file as a whole, not to
     +individual input files.  The variable names specified on DROP and KEEP
     +are those after any renaming with RENAME.
     +
     +The optional FIRST and LAST subcommands name variables that @cmd{MATCH
     +FILES} adds to the active file.  The new variables are numeric with
     +print and write format F1.0.  The value of the FIRST variable is 1 in
     +the first case with a given set of values for the BY variables, and 0
     +in other cases.  Similarly, the LAST variable is 1 in the last case
     +with a given of BY values, and 0 in other cases.
      
      @cmd{MATCH FILES} may not be specified following @cmd{TEMPORARY}
      (@pxref{TEMPORARY}) if the active file is used as an input source.
     Index: merge/tests/command/match-files.sh
     ===================================================================
     --- merge.orig/tests/command/match-files.sh        2007-06-05 
17:31:50.000000000 -0700
     +++ merge/tests/command/match-files.sh     2007-06-05 17:33:35.000000000 
-0700
     @@ -82,40 +82,40 @@
      if [ $? -ne 0 ] ; then no_result ; fi
      
      cat > ff.out <<EOF
     -A B C D INA INB
     -- - - - --- ---
     -0 a A     1   0
     -1 a B N   1   1
     -1 a C     1   0
     -2 a D     1   0
     -3 a E O   1   1
     -4 a F P   1   1
     -5 a G     1   0
     -5 a H     1   0
     -6 a I Q   1   1
     -7 a J R   1   1
     -7 a K     1   0
     -7 a L     1   0
     -8 a M     1   0
     -9 b   S   0   1
     +A B C D INA INB FIRST LAST
     +- - - - --- --- ----- ----
     +0 a A     1   0     1    1
     +1 a B N   1   1     1    0
     +1 a C     1   0     0    1
     +2 a D     1   0     1    1
     +3 a E O   1   1     1    1
     +4 a F P   1   1     1    1
     +5 a G     1   0     1    0
     +5 a H     1   0     0    1
     +6 a I Q   1   1     1    1
     +7 a J R   1   1     1    0
     +7 a K     1   0     0    0
     +7 a L     1   0     0    1
     +8 a M     1   0     1    1
     +9 b   S   0   1     1    1
      EOF
      
      cat > ft.out <<EOF
     -A B C D INA INB
     -- - - - --- ---
     -0 a A     1   0
     -1 a B N   1   1
     -1 a C N   1   1
     -2 a D     1   0
     -3 a E O   1   1
     -4 a F P   1   1
     -5 a G     1   0
     -5 a H     1   0
     -6 a I Q   1   1
     -7 a J R   1   1
     -7 a K R   1   1
     -7 a L R   1   1
     -8 a M     1   0
     +A B C D INA INB FIRST LAST
     +- - - - --- --- ----- ----
     +0 a A     1   0     1    1
     +1 a B N   1   1     1    0
     +1 a C N   1   1     0    1
     +2 a D     1   0     1    1
     +3 a E O   1   1     1    1
     +4 a F P   1   1     1    1
     +5 a G     1   0     1    0
     +5 a H     1   0     0    1
     +6 a I Q   1   1     1    1
     +7 a J R   1   1     1    0
     +7 a K R   1   1     0    0
     +7 a L R   1   1     0    1
     +8 a M     1   0     1    1
      EOF
      
      # Test nonparallel match and table lookup.
     @@ -140,21 +140,24 @@
      $sa
      $dlb
      $sb
     -match files $type1='a.sys' /in=INA /$type2='b.sys' /in=INB /rename c=D 
/by a.
     +match files $type1='a.sys' /in=INA /$type2='b.sys' /in=INB /rename c=D 
/by a
     +            /first=FIRST /last=LAST.
      EOF
            elif [ $sources = sa ]; then
                cat <<EOF
      $dla
      $sa
      $dlb
     -match files $type1='a.sys' /in=INA /$type2=* /in=INB /rename c=D /by a.
     +match files $type1='a.sys' /in=INA /$type2=* /in=INB /rename c=D /by a
     +            /first=FIRST /last=LAST.
      EOF
            elif [ $sources = as ]; then
                cat <<EOF
      $dlb
      $sb
      $dla
     -match files $type1=* /in=INA /$type2='b.sys' /in=INB /rename c=D /by a.
     +match files $type1=* /in=INA /$type2='b.sys' /in=INB /rename c=D /by a
     +            /first=FIRST /last=LAST.
      EOF
            else
                activity="internal error"
     
     --
     
     
     
     _______________________________________________
     pspp-dev mailing list
     address@hidden
     http://lists.gnu.org/mailman/listinfo/pspp-dev

-- 
PGP Public key ID: 1024D/2DE827B3 
fingerprint = 8797 A26D 0854 2EAB 0285  A290 8A67 719C 2DE8 27B3
See http://pgp.mit.edu or any PGP keyserver for public key.
signature.asc
Description: Digital signature
[Prev in Thread]
Current Thread
[Next in Thread]
Re: [patch 01/19] move casenumber to case.h, (continued)
- [patch 13/19] add case initialization infrastructure, blp, 2007/06/06
  - Re: [patch 13/19] add case initialization infrastructure, John Darrington, 2007/06/06
    - Re: [patch 13/19] add case initialization infrastructure, Ben Pfaff, 2007/06/07
- [patch 17/19] Start writing developers guide., blp, 2007/06/06
  - Re: [patch 17/19] Start writing developers guide., John Darrington, 2007/06/06
- [patch 15/19] the big patch, blp, 2007/06/06
  - Re: [patch 15/19] the big patch, John Darrington, 2007/06/06
    - Re: [patch 15/19] the big patch, Ben Pfaff, 2007/06/07
- [patch 16/19] Implement FIRST and LAST on MATCH FILES., blp, 2007/06/06
  - Re: [patch 16/19] Implement FIRST and LAST on MATCH FILES., John Darrington <=
- [patch 07/19] casereader and casewriter implementation, blp, 2007/06/06
  - Re: [patch 07/19] casereader and casewriter implementation, John Darrington, 2007/06/06
    - Re: [patch 07/19] casereader and casewriter implementation, Ben Pfaff, 2007/06/07
- [patch 14/19] datasheet implementation, blp, 2007/06/06
  - Re: [patch 14/19] datasheet implementation, John Darrington, 2007/06/06
    - Re: [patch 14/19] datasheet implementation, Ben Pfaff, 2007/06/07
    - Re: [patch 14/19] datasheet implementation, Dr Eberhard Lisse, 2007/06/07
    - Re: [patch 14/19] datasheet implementation, Ben Pfaff, 2007/06/07
Prev by Date: Re: [patch 17/19] Start writing developers guide.
Next by Date: Re: [patch 01/19] move casenumber to case.h
Previous by thread: [patch 16/19] Implement FIRST and LAST on MATCH FILES.
Next by thread: [patch 07/19] casereader and casewriter implementation
Index(es):
- Date
- Thread